# Fast-R-CNN/model.py

import math

import torch
import torch.nn as nn
import torchvision
from torchvision import models

from utils import compute_offsets, assign_label, generate_proposal
from loss import ClsScoreRegression, BboxRegression


class FeatureExtractor(nn.Module):
    """
    MobileNetV2 backbone used as an image feature extractor.

    The pretrained torchvision model is loaded and its last child module
    (the classifier) is dropped. When `pooling` is set, an average-pooling
    layer named 'LastAvgPool' is appended to the backbone. Every backbone
    parameter is left trainable.
    """
    def __init__(self, reshape_size=224, pooling=False, verbose=False):
        """
        Inputs:
        - reshape_size: Side length of the resized input images; only used to
          size the optional pooling kernel as ceil(reshape_size / 32)
        - pooling: If True, append an AvgPool2d layer after the backbone
        - verbose: Not used by the constructor
        """
        super().__init__()

        # Keep every child of the pretrained network except the last one
        # (the classifier).
        pretrained_net = models.mobilenet_v2(pretrained=True)
        backbone_layers = list(pretrained_net.children())[:-1]
        self.mobilenet = nn.Sequential(*backbone_layers)

        if pooling:
            # input: N x 1280 x 7 x 7
            kernel = math.ceil(reshape_size / 32.)
            self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(kernel))

        # fine-tune all
        for param in self.mobilenet.parameters():
            param.requires_grad = True

    def forward(self, img, verbose=False):
        """
        Run the backbone over a batch of images in chunks of 500.

        Inputs:
        - img: Batch of resized images, of shape Nx3x224x224
        - verbose: If True, print the shape of the returned feature

        Outputs:
        - feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
        """
        num_img = img.shape[0]
        chunk_size = 500

        chunks = []
        for start in range(0, num_img, chunk_size):
            out = self.mobilenet(img[start:start + chunk_size])
            # Drop the two trailing dimensions when they have size 1
            # (squeeze is a no-op on larger dimensions).
            out = out.squeeze(-1)
            out = out.squeeze(-1)
            chunks.append(out)
        feat = torch.cat(chunks)

        if verbose:
            print('Output feature shape: ', feat.shape)

        return feat


class FastRCNN(nn.Module):
    """
    Fast R-CNN detector: a FeatureExtractor backbone followed by RoI pooling
    and two prediction heads.

    - cls_head predicts num_classes + 1 scores per proposal; class index
      `num_classes` is treated as background at inference time.
    - bbox_head predicts 4 box offsets per proposal.

    Each head is its own Linear -> Dropout -> ReLU -> Linear stack; the two
    heads do not share any layers.
    NOTE(review): the original assignment text describes a trunk shared by both
    heads. The separate-trunk layout is kept here so that existing parameter
    names (`cls_head.*`, `bbox_head.*`) remain valid.
    """
    def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20,
                 roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
        """
        Inputs:
        - in_dim: Channel dimension of the RoI feature fed to the heads
        - hidden_dim: Width of the hidden Linear layer in each head
        - num_classes: Number of object categories (background excluded);
          must be non-zero
        - roi_output_w, roi_output_h: Width / height of the RoI-pooled grid
        - drop_ratio: Dropout probability used in both heads
        """
        super().__init__()

        assert num_classes != 0
        self.num_classes = num_classes
        self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
        self.feat_extractor = FeatureExtractor()

        # Classification head: num_classes object scores + 1 background score.
        self.cls_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes + 1),
        )
        # Box regression head: 4 offsets per proposal.
        self.bbox_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, 4),
        )

    def _roi_features(self, images, proposals, proposal_batch_ids):
        """
        Extract one feature vector per proposal: backbone feature map ->
        RoI pooling -> mean over the pooled spatial grid.

        Outputs:
        - roi_feat: Tensor with one row per proposal
        """
        feat = self.feat_extractor(images)

        # torchvision expects RoIs as rows of (batch_index, x1, y1, x2, y2).
        # Cast the ids so the concatenation never mixes dtypes.
        batch_col = proposal_batch_ids.unsqueeze(1).to(proposals.dtype)
        rois = torch.cat((batch_col, proposals), dim=1)

        # output_size is (height, width) in torchvision.
        # NOTE(review): spatial_scale is left at its default, so proposals are
        # interpreted as feature-map coordinates -- confirm against the
        # dataloader / proposal generator.
        roi_feat = torchvision.ops.roi_pool(
            feat, rois, output_size=(self.roi_output_h, self.roi_output_w))

        return roi_feat.mean(dim=[2, 3])

    def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):
        """
        Training-time forward pass for our two-stage Fast R-CNN detector.

        Steps: extract RoI features, predict class scores and box offsets,
        assign targets to the proposals of each image, then accumulate the
        classification loss and the box-regression loss (positive proposals
        only) over the images of the batch.

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes
        and category labels, from the dataloader, where N is the total number
        of GT boxes in the batch
        - bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)
        of the image that each GT box belongs to
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
        where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
        that each proposals belongs to

        Outputs:
        - total_loss: Torch scalar giving the overall training loss,
        w_cls * cls_loss + w_bbox * bbox_loss.
        """
        w_cls = 1  # for cls_scores
        w_bbox = 1  # for offsets

        B = images.shape[0]

        # RoI features -> predicted cls scores & offsets for every proposal.
        roi_feat = self._roi_features(images, proposals, proposal_batch_ids)
        cls_scores = self.cls_head(roi_feat)
        bbox_offsets = self.bbox_head(roi_feat)

        cls_loss = 0
        bbox_loss = 0
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx
            proposals_cur = proposals[in_img, :]

            # Positive/negative proposals of this image and the GT label/box
            # assigned to them. The negative mask is not needed below.
            pos_mask, _neg_mask, GT_label, GT_bbox = assign_label(
                proposals_cur, bboxes[bbox_batch_ids == img_idx, :], self.num_classes)

            # Classification loss over this image's proposals.
            cls_loss += ClsScoreRegression(cls_scores[in_img, :], GT_label, B)

            # Box regression loss over the positive proposals only.
            pos_box_offsets = bbox_offsets[in_img, :][pos_mask, :]
            pos_proposals = proposals_cur[pos_mask, :]
            GT_offsets = compute_offsets(pos_proposals, GT_bbox)
            bbox_loss += BboxRegression(pos_box_offsets, GT_offsets, B)

        total_loss = w_cls * cls_loss + w_bbox * bbox_loss
        return total_loss

    def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):
        """
        Inference-time forward pass for our two-stage Fast R-CNN detector

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
        where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
        that each proposals belongs to
        - thresh: Threshold value on confidence probability (softmax of the
        classification scores).
        - nms_thresh: IoU threshold for NMS

        We can output a variable number of predicted boxes per input image.
        In particular we assume that the input images[i] gives rise to P_i final
        predicted boxes.

        Outputs:
        - final_proposals: List of length (B,) where final_proposals[i] is a Tensor
        of shape (P_i, 4) giving the coordinates of the final predicted boxes for
        the input images[i]
        - final_conf_probs: List of length (B,) where final_conf_probs[i] is a
        Tensor of shape (P_i, 1) giving the softmax probability of the predicted
        class for each box in final_proposals[i]
        - final_class: List of length (B,), where final_class[i] is an int64 Tensor
        of shape (P_i, 1) giving the predicted category labels for each box in
        final_proposals[i].

        NOTE: dropout in the heads is only disabled when the module is in eval
        mode; this method does not switch modes itself.
        """
        B = images.shape[0]

        # RoI features -> predicted cls scores & offsets for every proposal.
        roi_feat = self._roi_features(images, proposals, proposal_batch_ids)
        cls_scores = self.cls_head(roi_feat)
        bbox_offsets = self.bbox_head(roi_feat)

        # Apply the predicted offsets to the input proposals.
        refined = generate_proposal(proposals, bbox_offsets)

        final_proposals = []
        final_conf_probs = []
        final_class = []
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx

            # Most likely class and its probability for each proposal.
            cls_prob = torch.softmax(cls_scores[in_img], dim=1)
            conf_probs, class_idx = torch.max(cls_prob, dim=1)

            # Keep confident predictions whose class is not background
            # (index num_classes).
            keep_mask = (conf_probs > thresh) & (class_idx != self.num_classes)
            proposals_obj = refined[in_img][keep_mask]
            conf_probs = conf_probs[keep_mask]
            class_idx = class_idx[keep_mask]

            # NMS across all kept boxes of this image (class-agnostic).
            keep = torchvision.ops.nms(proposals_obj, conf_probs, nms_thresh)
            final_proposals.append(proposals_obj[keep])
            final_conf_probs.append(conf_probs[keep].unsqueeze(1))
            final_class.append(class_idx[keep].unsqueeze(1))

        return final_proposals, final_conf_probs, final_class