This notebook contains the main part of my code. For the dependencies, see requirements.txt.
Environment: Ubuntu 16.04 with GTX 1080Ti and 64GB RAM.
import segmentation_models_pytorch as smp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim
from torch.autograd import Variable
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
# set pretrained=False if you don't want to use pretrained weights
vgg11 = models.vgg11(pretrained=True)
vgg_img = vgg11.features[:16]
print(vgg_img)
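As a quick sanity check (assuming 224x224 RGB inputs, which is also what the Grad-CAM preprocessing below uses), the truncated VGG11 should give a 512 x 14 x 14 feature map:
# shape check for the image branch (the 224x224 input size is my assumption)
with torch.no_grad():
    print(vgg_img(torch.rand(1, 3, 224, 224)).shape)  # expected: torch.Size([1, 512, 14, 14])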
def make_vgg_audio(cfg, batch_norm=False):
    layers = []
    in_channels = 2
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'M2':
            layers += [nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)
vgg_cfg = ['M2', 32, 32, 'M', 64, 64, 'M', 128, 128, 'M', 128, 128, 'M', 128, 128]
vgg_spec = make_vgg_audio(vgg_cfg, batch_norm = True)
print(vgg_spec)
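A similar shape check for the audio branch (the 2 x 512 x T spectrogram shape is my assumption: the 'M2' pool plus four 'M' pools reduce 512 frequency bins to the 16 that the fc1_s layer defined below expects after averaging over time):
# shape check for the audio branch (2-channel input, 512 frequency bins, arbitrary time length)
with torch.no_grad():
    print(vgg_spec(torch.rand(1, 2, 512, 256)).shape)  # expected: torch.Size([1, 128, 16, 16])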
class PotentialLocalizeNet(nn.Module):
    def __init__(self, vgg_img, vgg_spec):
        super(PotentialLocalizeNet, self).__init__()
        self.vgg_img = vgg_img
        self.vgg_spec = vgg_spec
        self.fc1_i = nn.Conv2d(512, 128, kernel_size=1)
        self.fc2_i = nn.Conv2d(128, 128, kernel_size=1)
        self.fc4_i = nn.Conv2d(128, 1, kernel_size=1)
        self.fc1_s = nn.Linear(128 * 16, 128)
        self.fc2_s = nn.Linear(128, 128)
        self.fc1_v2s = nn.Linear(128, 128)
        self.fc2_v2s = nn.Linear(128, 128)

    def predict(self, img, spec):
        img_feat = self.vgg_img(img)
        tmp = F.relu(self.fc1_i(img_feat))
        f_v = F.relu(self.fc2_i(tmp))
        tmp = self.fc4_i(f_v)
        # potential localization map
        p_alpha = F.softmax(tmp.view(-1, 14 * 14), dim=1).view(-1, 14, 14).unsqueeze(1)
        # global average pooling
        spec_feat = self.vgg_spec(spec).mean(dim=3).view(-1, 128 * 16)
        tmp = F.relu(self.fc1_s(spec_feat))
        f_s = F.relu(self.fc2_s(tmp))
        f_s_normed = f_s / f_s.norm(dim=1).unsqueeze(1)
        f_v_normed = f_v / f_v.norm(dim=1).unsqueeze(1)
        # attention map
        a = torch.sum(f_s_normed.unsqueeze(2).unsqueeze(3) * f_v_normed, dim=1).unsqueeze(1)
        # localization map
        alpha = (a * p_alpha) / (a * p_alpha).sum(dim=(2, 3)).unsqueeze(2).unsqueeze(3)
        return alpha, p_alpha

    def predict_img(self, img):
        img_feat = self.vgg_img(img)
        tmp = F.relu(self.fc1_i(img_feat))
        f_v = F.relu(self.fc2_i(tmp))
        tmp = self.fc4_i(f_v)
        # potential localization map
        p_alpha = F.softmax(tmp.view(-1, 14 * 14), dim=1).view(-1, 14, 14).unsqueeze(1)
        return p_alpha

    def forward(self, img, spec):
        img_feat = self.vgg_img(img)
        tmp = F.relu(self.fc1_i(img_feat))
        f_v = F.relu(self.fc2_i(tmp))
        tmp = self.fc4_i(f_v)
        # potential localization map
        p_alpha = F.softmax(tmp.view(-1, 14 * 14), dim=1).view(-1, 14, 14).unsqueeze(1)
        # global average pooling
        spec_feat = self.vgg_spec(spec).mean(dim=3).view(-1, 128 * 16)
        tmp = F.relu(self.fc1_s(spec_feat))
        f_s = F.relu(self.fc2_s(tmp))
        f_s_normed = f_s / f_s.norm(dim=1).unsqueeze(1)
        f_v_normed = f_v / f_v.norm(dim=1).unsqueeze(1)
        # attention map
        a = torch.sum(f_s_normed.unsqueeze(2).unsqueeze(3) * f_v_normed, dim=1).unsqueeze(1)
        # localization map
        alpha = (a * p_alpha) / (a * p_alpha).sum(dim=(2, 3)).unsqueeze(2).unsqueeze(3)
        f_v2s = (f_v * alpha).sum(dim=(2, 3))
        tmp = F.relu(self.fc1_v2s(f_v2s))
        f_s_hat = F.relu(self.fc2_v2s(tmp))
        f_s_hat_normed = f_s_hat / f_s_hat.norm(dim=1).unsqueeze(1)
        return f_s_normed, f_s_hat_normed
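To restate what forward computes above in equation form: with visual features f_v(x, y) in R^128 on the 14 x 14 grid and the audio embedding f_s in R^128,

p_alpha(x, y) = softmax over the 14 x 14 grid of fc4_i(f_v)(x, y)   (potential localization map)
a(x, y) = < f_s / ||f_s||, f_v(x, y) / ||f_v(x, y)|| >   (audio-visual attention)
alpha(x, y) = a(x, y) p_alpha(x, y) / sum_{x', y'} a(x', y') p_alpha(x', y')   (localization map)

and the visual-to-sound embedding is f_s_hat = ReLU(fc2_v2s(ReLU(fc1_v2s(sum_{x, y} alpha(x, y) f_v(x, y))))), which forward returns L2-normalized together with the normalized f_s.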
model = PotentialLocalizeNet(vgg_img, vgg_spec)
print(model)
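As an end-to-end shape check (the input shapes are my assumption, matching the checks above: 224x224 RGB frames and 2 x 512 x T spectrograms):
# shape check only; not part of training
with torch.no_grad():
    dummy_img = torch.rand(4, 3, 224, 224)
    dummy_spec = torch.rand(4, 2, 512, 256)
    f_s, f_s_hat = model(dummy_img, dummy_spec)             # normalized audio / visual-to-sound embeddings
    alpha, p_alpha = model.predict(dummy_img, dummy_spec)   # localization / potential localization maps
    print(f_s.shape, f_s_hat.shape)    # torch.Size([4, 128]) torch.Size([4, 128])
    print(alpha.shape, p_alpha.shape)  # torch.Size([4, 1, 14, 14]) torch.Size([4, 1, 14, 14])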
For the setting with pretrained weights, use:
optimizer = optim.Adam(
    [
        {"params": model.vgg_img.parameters(), "lr": 1e-5},
        {"params": model.vgg_spec.parameters(), "lr": 1e-4},
        {"params": model.fc1_i.parameters(), "lr": 1e-4},
        {"params": model.fc2_i.parameters(), "lr": 1e-4},
        {"params": model.fc4_i.parameters(), "lr": 1e-4},
        {"params": model.fc1_s.parameters(), "lr": 1e-4},
        {"params": model.fc2_s.parameters(), "lr": 1e-4},
        {"params": model.fc1_v2s.parameters(), "lr": 1e-4},
        {"params": model.fc2_v2s.parameters(), "lr": 1e-4},
    ],
    lr=1e-4,
)
num_epochs = 50
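An equivalent, more compact way to write the same parameter groups (my rewrite, not from the original setup): only the image backbone needs an explicit learning rate, since every other module uses the default 1e-4.
# same configuration as above: vgg_img at 1e-5, everything else at the default 1e-4
vgg_img_ids = {id(p) for p in model.vgg_img.parameters()}
other_params = [p for p in model.parameters() if id(p) not in vgg_img_ids]
optimizer = optim.Adam(
    [
        {"params": model.vgg_img.parameters(), "lr": 1e-5},
        {"params": other_params},  # picks up the default lr below
    ],
    lr=1e-4,
)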
For the setting without pretrained weights, use:
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
num_epochs = 25
model = smp.Unet('resnet34', encoder_weights='imagenet')
print(model)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
num_epochs = 100
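Quick shape check for the U-Net baseline (assuming 3-channel 224x224 inputs and the default single-channel mask output of smp.Unet):
with torch.no_grad():
    print(model(torch.rand(1, 3, 224, 224)).shape)  # expected: torch.Size([1, 1, 224, 224])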
For the Grad-CAM implementation below, we use code from https://github.com/utkuozbulak/pytorch-cnn-visualizations
def preprocess_image(pil_im, resize_im=True):
    """
    Processes image for CNNs
    Args:
        pil_im (PIL Image): Image to process
        resize_im (bool): Whether to shrink the image (thumbnail to at most 512x512)
    Returns:
        im_as_var (torch variable): Variable that contains processed float tensor
    """
    # mean and std list for channels (ImageNet)
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    # Resize image
    if resize_im:
        pil_im.thumbnail((512, 512))
    im_as_arr = np.float32(pil_im)
    im_as_arr = im_as_arr.transpose(2, 0, 1)  # Convert array from H,W,C to C,H,W
    # Normalize the channels
    for channel, _ in enumerate(im_as_arr):
        im_as_arr[channel] /= 255
        im_as_arr[channel] -= mean[channel]
        im_as_arr[channel] /= std[channel]
    # Convert to float tensor
    im_as_ten = torch.from_numpy(im_as_arr).float()
    # Add a batch dimension. Tensor shape = 1,3,224,224
    im_as_ten.unsqueeze_(0)
    # Convert to Pytorch variable
    im_as_var = Variable(im_as_ten, requires_grad=True)
    return im_as_var
def get_params(img_path):
    original_image = Image.open(img_path).convert('RGB').resize((224, 224))
    prep_img = preprocess_image(original_image)
    # Define model
    pretrained_model = models.vgg11(pretrained=True)
    pretrained_model.eval()
    return (original_image,
            prep_img,
            pretrained_model)
class CamExtractor():
    """
    Extracts cam features from the model
    """
    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None

    def save_gradient(self, grad):
        self.gradients = grad

    def forward_pass_on_convolutions(self, x):
        """
        Does a forward pass on convolutions, hooks the function at given layer
        """
        conv_output = None
        for module_pos, module in self.model.features._modules.items():
            x = module(x)  # Forward
            if int(module_pos) == self.target_layer:
                x.register_hook(self.save_gradient)
                conv_output = x  # Save the convolution output on that layer
        return conv_output, x

    def forward_pass(self, x):
        """
        Does a full forward pass on the model
        """
        # Forward pass on the convolutions
        conv_output, x = self.forward_pass_on_convolutions(x)
        x = x.view(x.size(0), -1)  # Flatten
        # Forward pass on the classifier
        x = self.model.classifier(x)
        return conv_output, x
class GradCam():
    """
    Produces class activation map
    """
    def __init__(self, model, target_layer):
        self.model = model
        self.model.eval()
        # Define extractor
        self.extractor = CamExtractor(self.model, target_layer)

    def generate_cam(self, input_image, rank, target_class=None):
        # Full forward pass
        # conv_output is the output of convolutions at specified layer
        # model_output is the final output of the model (1, 1000)
        conv_output, model_output = self.extractor.forward_pass(input_image)
        if target_class is None:
            # pick the class of the given rank (rank 0 = top-1 prediction)
            target_class = np.argsort(model_output.data.numpy().ravel())[-(rank + 1)]
        # Target for backprop
        one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_()
        one_hot_output[0][target_class] = 1
        # Zero grads
        self.model.features.zero_grad()
        self.model.classifier.zero_grad()
        # Backward pass with specified target
        model_output.backward(gradient=one_hot_output, retain_graph=True)
        # Get hooked gradients
        guided_gradients = self.extractor.gradients.data.numpy()[0]
        # Get convolution outputs
        target = conv_output.data.numpy()[0]
        # Get weights from gradients
        weights = np.mean(guided_gradients, axis=(1, 2))  # Take averages for each gradient
        # Create empty numpy array for cam
        cam = np.ones(target.shape[1:], dtype=np.float32)
        # Multiply each weight with its conv output and then, sum
        for i, w in enumerate(weights):
            cam += w * target[i, :, :]
        cam = np.maximum(cam, 0)
        cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam))  # Normalize between 0-1
        cam = np.uint8(cam * 255)  # Scale between 0-255 to visualize
        cam = np.uint8(Image.fromarray(cam).resize((input_image.shape[2],
                       input_image.shape[3]), Image.ANTIALIAS)) / 255
        # ^ I am extremely unhappy with this line. Originally resizing was done in cv2 which
        # supports resizing numpy matrices with antialiasing, however,
        # when I moved the repository to PIL, this option was out of the window.
        # So, in order to use resizing with ANTIALIAS feature of PIL,
        # I briefly convert matrix to PIL image and then back.
        # If there is a more beautiful way, do not hesitate to send a PR.
        return cam, target_class

    def generate_cam_max(self, input_image, size):
        # Full forward pass
        # conv_output is the output of convolutions at specified layer
        # model_output is the final output of the model (1, 1000)
        conv_output, model_output = self.extractor.forward_pass(input_image)
        # the `size` highest-scoring classes, in descending order
        target_classes = np.argsort(model_output.data.numpy().ravel())[::-1][:size]
        # Target for backprop
        cams = np.zeros((size, 224, 224))
        for num, target_class in enumerate(tqdm(target_classes)):
            one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_()
            one_hot_output[0][target_class] = 1
            # Zero grads
            self.model.features.zero_grad()
            self.model.classifier.zero_grad()
            # Backward pass with specified target
            model_output.backward(gradient=one_hot_output, retain_graph=True)
            # Get hooked gradients
            guided_gradients = self.extractor.gradients.data.numpy()[0]
            # Get convolution outputs
            target = conv_output.data.numpy()[0]
            # Get weights from gradients
            weights = np.mean(guided_gradients, axis=(1, 2))  # Take averages for each gradient
            # Create empty numpy array for cam
            cam = np.ones(target.shape[1:], dtype=np.float32)
            # Multiply each weight with its conv output and then, sum
            for i, w in enumerate(weights):
                cam += w * target[i, :, :]
            cam = np.maximum(cam, 0)
            cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam))  # Normalize between 0-1
            cam = np.uint8(cam * 255)  # Scale between 0-255 to visualize
            cam = np.uint8(Image.fromarray(cam).resize((input_image.shape[2],
                           input_image.shape[3]), Image.ANTIALIAS)) / 255
            # (see the note in generate_cam above about resizing via PIL)
            cams[num] = cam / cam.sum()
        cam_max = cams.max(axis=0)
        cam_max = cam_max / cam_max.sum()
        return cam_max
img_path = 'example.png'
(original_image, prep_img, pretrained_model) = get_params(img_path)
grad_cam = GradCam(pretrained_model, target_layer=20)
cam, class_id = grad_cam.generate_cam(prep_img, 0)
cam = cam/cam.sum()
plt.imshow(original_image)
plt.axis('off')
plt.show()
plt.imshow(original_image)
plt.imshow(cam, alpha = 0.5, cmap = 'jet')
plt.axis('off')
plt.show()
img_path = 'example.png'
top_N = 30
(original_image, prep_img, pretrained_model) = get_params(img_path)
grad_cam = GradCam(pretrained_model, target_layer=20)
cam = grad_cam.generate_cam_max(prep_img, top_N)
cam = cam/cam.sum()
plt.imshow(original_image)
plt.axis('off')
plt.show()
plt.imshow(original_image)
plt.imshow(cam, alpha = 0.5, cmap = 'jet')
plt.axis('off')
plt.show()