Building an architecture to detect the class and position of multiple entities in a single image. Based on the convolutional YOLOv3 architecture. Paper:

Dataset Preparation

pascal_root = Path('PascalData')
j = json.load((pascal_root/'PASCAL_VOC'/'pascal_train2007.json').open())
j.keys()
dict_keys(['images', 'type', 'annotations', 'categories'])

hw_bb[source]

hw_bb(bb)

Convert height width to bounding box coords

bb_hw[source]

bb_hw(a)

Convert bounding box coords to height width

show_img[source]

show_img(im, figsize=None, ax=None)

Show the given image

draw_outline[source]

draw_outline(o, lw)

Draw the outline around a bounding box

draw_rect[source]

draw_rect(ax, b)

Add a rectangle outlining a given bounding box

draw_text[source]

draw_text(ax, xy, txt, sz=14)

Draw the class type above the bounding box, with font size sz

ax = show_img(im)
b = bb_hw(im0_a[0])
draw_rect(ax, b)
draw_text(ax, b[:2], cats[im0_a[1]])

draw_im[source]

draw_im(im, ann)

Draw an image along with its bounding box and class

draw_idx[source]

draw_idx(i)

Open and draw a given image with id i

draw_idx(12)
torch.Size([3, 333, 500])

Architecture

get_res_body[source]

get_res_body(size, n_in, c_out=10)

Return the body of a resnet with given size and n_in input channels

res_body = get_res_body(34, 3)
res_body(torch.randn(1,3, 224, 224)).shape
torch.Size([1, 512, 7, 7])

We now have to take the output of the ResNet body and turn it into k outputs of size (4 + c), where k is the number of anchor boxes: 4 outputs for the bounding box coordinates and c activations representing the predicted class.

class StandardConv[source]

StandardConv(n_in, n_out, ks=3, stride=2, drop=0.1, **kwargs) :: Module

Standard convolutional layer to downsample the ResNet output

class PredictionConv[source]

PredictionConv(k, n_in, n_clas) :: Module

Convolutional layer to split activations into predicted bbox coordinates and classes

class SSDHead[source]

SSDHead(k, n_clas, drop=0.1) :: Module

Convolutional layers to split the ResNet body activations into predictions of various sizes, each representing different regions of the image

class GenericModel[source]

GenericModel(body, head) :: Module

Wrapper to combine a head and body into a single model

ssd_head = SSDHead(9, 10)
ssd = GenericModel(res_body, ssd_head)
ssd
GenericModel(
  (body): Sequential(
    (0): ConvBatchLayer(
      (conv): AutoConv(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      )
      (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): Identity()
    )
    (1): ConvBatchLayer(
      (conv): AutoConv(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): Identity()
    )
    (2): ConvBatchLayer(
      (conv): AutoConv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): Identity()
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (1): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (2): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
    )
    (5): Sequential(
      (0): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Sequential(
          (0): AvgPool2d(kernel_size=2, stride=2, padding=0)
          (1): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (activation): ReLU()
      )
      (1): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (2): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (3): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
    )
    (6): Sequential(
      (0): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Sequential(
          (0): AvgPool2d(kernel_size=2, stride=2, padding=0)
          (1): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (activation): ReLU()
      )
      (1): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (2): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (3): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (4): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (5): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
    )
    (7): Sequential(
      (0): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Sequential(
          (0): AvgPool2d(kernel_size=2, stride=2, padding=0)
          (1): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (activation): ReLU()
      )
      (1): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
      (2): BaseRes(
        (blocks): Sequential(
          (0): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
          (1): ReLU()
          (2): ConvBatchLayer(
            (conv): AutoConv(
              (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (activation): Identity()
          )
        )
        (identity): Identity()
        (activation): ReLU()
      )
    )
  )
  (head): SSDHead(
    (relu): ReLU()
    (drop): Dropout(p=0.1, inplace=False)
    (initial_layer): StandardConv(
      (layers): Sequential(
        (0): ConvBatchLayer(
          (conv): AutoConv(
            (conv): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): Identity()
        )
        (1): Dropout(p=0.1, inplace=False)
      )
    )
    (conv_to_4): StandardConv(
      (layers): Sequential(
        (0): ConvBatchLayer(
          (conv): AutoConv(
            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): Identity()
        )
        (1): Dropout(p=0.1, inplace=False)
      )
    )
    (conv_to_2): StandardConv(
      (layers): Sequential(
        (0): ConvBatchLayer(
          (conv): AutoConv(
            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): Identity()
        )
        (1): Dropout(p=0.1, inplace=False)
      )
    )
    (conv_to_1): StandardConv(
      (layers): Sequential(
        (0): ConvBatchLayer(
          (conv): AutoConv(
            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): Identity()
        )
        (1): Dropout(p=0.1, inplace=False)
      )
    )
    (out_conv_4): PredictionConv(
      (class_conv): AutoConv(
        (conv): Conv2d(256, 11, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (bbox_conv): AutoConv(
        (conv): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (out_conv_2): PredictionConv(
      (class_conv): AutoConv(
        (conv): Conv2d(256, 11, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (bbox_conv): AutoConv(
        (conv): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (out_conv_1): PredictionConv(
      (class_conv): AutoConv(
        (conv): Conv2d(256, 11, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (bbox_conv): AutoConv(
        (conv): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
  )
)
!python notebook2script.py SingleShotDetector.ipynb
Converted SingleShotDetector.ipynb to ModernArchitecturesFromPyTorch/nb_SingleShotDetector.py