VGG19 / VGG16

Input Image: 224 x 224 x 3
All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded into a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].

For all conv layers: bias = True, filter size = 3x3, stride = 1, padding = 1 (same).
Each conv layer is followed by a ReLU.
The 19 weight layers include 3 fully connected layers,
so there are 16 convolutional layers (each followed by ReLU), 5 max-pool layers, and 3 fully connected layers.
That gives 16 kernels and 16 biases for the conv layers, plus 3 weight matrices and 3 biases for the FC layers.
With 5 max pools (2x2, stride 2), the spatial size before the FC layers = 224 / 2^5 = 224 / 32 = 7.
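
A quick sketch (not part of the original notes) to check this arithmetic:

size = 224
for _ in range(5):              # five 2x2 max-pools with stride 2
    size //= 2                  # each pool halves H and W
print(size)                     # 7
print(512 * size * size)        # 25088, the in_features of the first FC layer (see classifier.0 below)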
Also, PyTorch expects channels-first input: C x H x W = 3 x 224 x 224.

nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0)
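
For example, a small sketch of the first conv layer (matching Conv2d-1 in the summary below):

import torch.nn as nn

# 3 input channels -> 64 output channels, 3x3 kernel, stride 1, padding 1 ("same")
conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
# 64*3*3*3 weights + 64 biases = 1,792, as in the summary
print(sum(p.numel() for p in conv1.parameters()))   # 1792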

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
             ReLU-14          [-1, 256, 56, 56]               0
           Conv2d-15          [-1, 256, 56, 56]         590,080
             ReLU-16          [-1, 256, 56, 56]               0
           Conv2d-17          [-1, 256, 56, 56]         590,080
             ReLU-18          [-1, 256, 56, 56]               0
        MaxPool2d-19          [-1, 256, 28, 28]               0
           Conv2d-20          [-1, 512, 28, 28]       1,180,160
             ReLU-21          [-1, 512, 28, 28]               0
           Conv2d-22          [-1, 512, 28, 28]       2,359,808
             ReLU-23          [-1, 512, 28, 28]               0
           Conv2d-24          [-1, 512, 28, 28]       2,359,808
             ReLU-25          [-1, 512, 28, 28]               0
           Conv2d-26          [-1, 512, 28, 28]       2,359,808
             ReLU-27          [-1, 512, 28, 28]               0
        MaxPool2d-28          [-1, 512, 14, 14]               0
           Conv2d-29          [-1, 512, 14, 14]       2,359,808
             ReLU-30          [-1, 512, 14, 14]               0
           Conv2d-31          [-1, 512, 14, 14]       2,359,808
             ReLU-32          [-1, 512, 14, 14]               0
           Conv2d-33          [-1, 512, 14, 14]       2,359,808
             ReLU-34          [-1, 512, 14, 14]               0
           Conv2d-35          [-1, 512, 14, 14]       2,359,808
             ReLU-36          [-1, 512, 14, 14]               0
        MaxPool2d-37            [-1, 512, 7, 7]               0
AdaptiveAvgPool2d-38            [-1, 512, 7, 7]               0
           Linear-39                 [-1, 4096]     102,764,544
             ReLU-40                 [-1, 4096]               0
          Dropout-41                 [-1, 4096]               0
           Linear-42                 [-1, 4096]      16,781,312
             ReLU-43                 [-1, 4096]               0
          Dropout-44                 [-1, 4096]               0
           Linear-45                 [-1, 1000]       4,097,000
================================================================
Total params: 143,667,240
Trainable params: 143,667,240
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 238.69
Params size (MB): 548.05
Estimated Total Size (MB): 787.31
----------------------------------------------------------------
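
The table above looks like torchsummary output; a minimal sketch to reproduce something similar (assuming the third-party torchsummary package is installed):

import torch
from torchvision import models
from torchsummary import summary      # third-party package, assumed installed

model = models.vgg19(pretrained=True).to(torch.device("cuda:0"))
summary(model, (3, 224, 224))         # prints a layer-by-layer table like the one above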
-------------------
Model's state_dict:
features.0.weight        torch.Size([64, 3, 3, 3])
features.0.bias          torch.Size([64])
features.2.weight        torch.Size([64, 64, 3, 3])
features.2.bias          torch.Size([64])
features.5.weight        torch.Size([128, 64, 3, 3])
features.5.bias          torch.Size([128])
features.7.weight        torch.Size([128, 128, 3, 3])
features.7.bias          torch.Size([128])
features.10.weight       torch.Size([256, 128, 3, 3])
features.10.bias         torch.Size([256])
features.12.weight       torch.Size([256, 256, 3, 3])
features.12.bias         torch.Size([256])
features.14.weight       torch.Size([256, 256, 3, 3])
features.14.bias         torch.Size([256])
features.16.weight       torch.Size([256, 256, 3, 3])
features.16.bias         torch.Size([256])
features.19.weight       torch.Size([512, 256, 3, 3])
features.19.bias         torch.Size([512])
features.21.weight       torch.Size([512, 512, 3, 3])
features.21.bias         torch.Size([512])
features.23.weight       torch.Size([512, 512, 3, 3])
features.23.bias         torch.Size([512])
features.25.weight       torch.Size([512, 512, 3, 3])
features.25.bias         torch.Size([512])
features.28.weight       torch.Size([512, 512, 3, 3])
features.28.bias         torch.Size([512])
features.30.weight       torch.Size([512, 512, 3, 3])
features.30.bias         torch.Size([512])
features.32.weight       torch.Size([512, 512, 3, 3])
features.32.bias         torch.Size([512])
features.34.weight       torch.Size([512, 512, 3, 3])
features.34.bias         torch.Size([512])
classifier.0.weight      torch.Size([4096, 25088])
classifier.0.bias        torch.Size([4096])
classifier.3.weight      torch.Size([4096, 4096])
classifier.3.bias        torch.Size([4096])
classifier.6.weight      torch.Size([1000, 4096])
classifier.6.bias        torch.Size([1000])
-------------------
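
The state_dict shapes above can be reproduced with a short loop (a sketch):

from torchvision import models

model = models.vgg19(pretrained=True)
for name, tensor in model.state_dict().items():    # parameter name -> tensor
    print(name, '\t', tensor.size())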
Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
---------------------------
[Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (17): ReLU(inplace)
  (18): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (19): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (20): ReLU(inplace)
  (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (22): ReLU(inplace)
  (23): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (24): ReLU(inplace)
  (25): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (26): ReLU(inplace)
  (27): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (29): ReLU(inplace)
  (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (31): ReLU(inplace)
  (32): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (33): ReLU(inplace)
  (34): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (35): ReLU(inplace)
  (36): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
), AdaptiveAvgPool2d(output_size=(7, 7)), Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace)
  (2): Dropout(p=0.5)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace)
  (5): Dropout(p=0.5)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)]
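
These two dumps look like (a) iterating over model.features and (b) list(model.children()); a sketch of both (my reading, not the original code):

from torchvision import models

model = models.vgg19(pretrained=True)

# flat Conv2d / ReLU / MaxPool2d listing:
for layer in model.features:
    print(layer)

# bracketed [Sequential(...), AdaptiveAvgPool2d(...), Sequential(...)] listing:
print(list(model.children()))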




VGG Model:

Downloaded the validation dataset (with labels) from the ImageNet website.
However, the label indices provided by ImageNet differ from the indices VGG was trained on, even though the classes are the same:
https://stackoverflow.com/questions/42537483/
No preprocessing is done except a 224x224 center crop and normalization.

RGB data should be supplied.
(Checked with BGR to confirm; BGR gave lower accuracy.)
PIL sometimes reads an image as grayscale, so every image is explicitly converted with Image.open(path).convert('RGB').

Using a single central crop, the top-5 classification error I got on the ILSVRC-2012 validation set was 14.386%:
42,807 out of 50,000 images were correctly classified within the top 5 classes.
Testing on the validation set took 12 min 26 s with a batch size of 1.
I thought of using a batch size of 50,000: CUDA out-of-memory error. Even at 50.

https://gist.github.com/ksimonyan/fd8800eeb36e276cd6f9#note
This link suggests first resizing the smallest side to 256 (preserving aspect ratio) and then cropping to 224x224.
I tried it, resizing with bilinear interpolation via transforms.Resize(256) from PyTorch, followed by the same center crop and normalization.
Accuracy was better: 45,443 correct, i.e. a 9.114% top-5 error. Strange.

Some mistakes:
Forgot to use model.eval().
Checked in model.train() mode to confirm that the output does change when eval() is not used.
Also used RandomCrop instead of CenterCrop, so results differed across runs even in eval mode.
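
A small sketch illustrating the eval/train point above: with dropout active in train mode, two passes over the same input differ; in eval mode they don't (dummy input, not the actual validation data):

import torch
from torchvision import models

model = models.vgg19(pretrained=True)
x = torch.randn(1, 3, 224, 224)                  # fixed dummy input

model.train()                                    # dropout active
print(torch.allclose(model(x), model(x)))        # False (almost surely)

model.eval()                                     # dropout disabled
print(torch.allclose(model(x), model(x)))        # True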

Good Read:
https://medium.com/@josh_2774/deep-learning-with-pytorch-9574e74d17ad
https://gist.github.com/ksimonyan/fd8800eeb36e276cd6f9#note
http://www.csc.kth.se/~roelof/deepdream/visclasses.html

Val Label:
https://gist.github.com/lvdmaaten/f94d6415ef448e043dae63b48e993da5
https://github.com/torch/tutorials/blob/master/7_imagenet_classification/synset_words.txt
https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a

From the ksimonyan gist linked above:
ILSVRC-2012 performance
Using 10 test crops (corners, centre, and horizontal flips), the top-5 classification error on the validation set of ILSVRC-2012 is 13.1%.

Using a single central crop, the top-5 classification error on the validation set of ILSVRC-2012 is 15.4%



import torch
from PIL import Image
from torchvision import models, transforms

assert torch.cuda.is_available(), 'GPU NOT AVAILABLE!'
device = torch.device("cuda:0")       # "cpu"
model = models.vgg19(pretrained=True).to(device)
for param in model.parameters():
    param.requires_grad = False

class pilDataset():
    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):                 # a[i] calls a.__getitem__(i)
        path = self.paths[index]
        img = Image.open(path).convert('RGB')     # force RGB; PIL sometimes reads images as grayscale
        return img

filename = 'val.txt'
image_paths = []
gt = []
with open(filename) as fh:
    for line in fh:
        imgname, lbl = line.strip().split(' ', 1)
        image_paths.append('ImageNet/ILSVRC2012_img_val/' + imgname)
        gt.append(lbl)

_image_size = 224
_mean = [0.485, 0.456, 0.406]
_std = [0.229, 0.224, 0.225]
trans = transforms.Compose([  
    transforms.Resize(256),
    transforms.CenterCrop(_image_size),                  # RandomCrop
    transforms.ToTensor(),                              # range 0 - 1
    transforms.Normalize(_mean, _std),
]) # Torchvision reads datasets into PILImage (Python imaging format) so transform will work on pil datatypes only

%%time
k = 5
intopk = 0
val_set = pilDataset(image_paths)

model.eval()
for i in range(len(val_set)):
    imgtensor = trans(val_set[i])                          #imgtensor = imgtensor[[2, 1, 0], :, :]
    ip = imgtensor.unsqueeze(0).to(device)                 #dimension is already CHW, we make it 1CHW
    logits = model(ip)
    _, idxs = torch.topk(logits, k)
    idxs = idxs.cpu().numpy().squeeze().tolist()
    if int(gt[i]) in idxs:                             # gt labels are strings; compare as ints
        intopk += 1
print(intopk)
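
A follow-up line (a sketch) to turn the count into a top-5 error percentage, assuming the full 50,000-image validation set was evaluated:

total = len(val_set)                                           # 50000 for the full val set
print('top-5 error: {:.3f}%'.format(100.0 * (1 - intopk / total)))   # 9.114% with the Resize(256) + CenterCrop pipeline above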