# Fast-R-CNN/model.py

import math

import torch
import torch.nn as nn
import torchvision
from torchvision import models

from utils import compute_offsets, assign_label, generate_proposal
from loss import ClsScoreRegression, BboxRegression


class FeatureExtractor(nn.Module):
    """
    MobileNetV2 backbone used as an image feature extractor.

    The pretrained torchvision MobileNetV2 is loaded and its last child
    module (the classifier) is dropped. When `pooling` is set, an average
    pooling layer with kernel size ceil(reshape_size / 32) is appended.
    Every backbone parameter is left trainable.
    """
    def __init__(self, reshape_size=224, pooling=False, verbose=False):
        # NOTE: `verbose` is accepted for interface compatibility but is not
        # used by the constructor.
        super().__init__()

        backbone = models.mobilenet_v2(pretrained=True)
        # Keep every child module except the final classifier.
        kept_layers = list(backbone.children())[:-1]
        self.mobilenet = nn.Sequential(*kept_layers)

        # Optional average pooling over the final feature map
        # (input to the pool: N x 1280 x 7 x 7 for 224x224 images).
        if pooling:
            kernel = math.ceil(reshape_size/32.)
            self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(kernel))

        # Fine-tune the whole backbone.
        for param in self.mobilenet.parameters():
            param.requires_grad = True

    def forward(self, img, verbose=False):
        """
        Run the backbone over a batch of images, at most 500 at a time.

        Inputs:
        - img: Batch of resized images, of shape Nx3x224x224
        - verbose: If True, print the shape of the returned feature

        Outputs:
        - feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
        """
        chunk_size = 500
        num_chunks = math.ceil(img.shape[0]/chunk_size)

        # Forward each chunk, then drop trailing singleton spatial dims
        # (squeeze is a no-op when the last dims are not of size 1).
        chunk_feats = [
            self.mobilenet(img[k*chunk_size:(k+1)*chunk_size]).squeeze(-1).squeeze(-1)
            for k in range(num_chunks)
        ]
        feat = torch.cat(chunk_feats)

        if verbose:
            print('Output feature shape: ', feat.shape)

        return feat


class FastRCNN(nn.Module):
    """
    Fast R-CNN style detector.

    Pipeline: backbone feature map -> RoI pooling on the given proposals ->
    spatial mean -> a classification head (num_classes + 1 scores) and a
    box-regression head (4 offsets).
    """
    def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \
                roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
        """
        Inputs:
        - in_dim: Channel dimension of the RoI feature fed to the heads
        - hidden_dim: Width of the hidden Linear layer in each head
        - num_classes: Number of object categories (background excluded)
        - roi_output_w, roi_output_h: Width / height of the RoI pooling output
        - drop_ratio: Dropout probability used inside each head
        """
        super().__init__()

        assert(num_classes != 0)
        self.num_classes = num_classes
        self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
        self.feat_extractor = FeatureExtractor()

        # NOTE(review): the assignment text describes a shared FC trunk
        # (Linear -> Dropout -> ReLU -> Linear(hidden_dim, hidden_dim)) followed
        # by two single Linear heads. Here each head owns its own MLP; the
        # layout is kept unchanged so the parameter names stay the same.
        # Classification head: num_classes foreground scores + 1 background.
        self.cls_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes+1)
        )
        # Box head: 4 regression offsets per proposal.
        self.bbox_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, 4)
        )

    def _roi_predictions(self, images, proposals, proposal_batch_ids):
        """Return (cls_scores, bbox_offsets), one row per proposal."""
        # i) image feature map
        feat = self.feat_extractor(images)

        # ii) RoI pooling, then mean over the spatial bins.
        # torchvision expects RoIs as rows of [batch_index, box coordinates];
        # the index column is cast so both parts share one dtype.
        rois = torch.cat(
            (proposal_batch_ids.unsqueeze(1).to(proposals.dtype), proposals), dim=1)
        # output_size is (height, width) in torchvision.
        # NOTE(review): spatial_scale is left at its default (1.0), i.e. the
        # proposals are used as feature-map coordinates - confirm this matches
        # the coordinate frame the caller provides.
        roi_feat = torchvision.ops.roi_pool(
            feat, rois, output_size=(self.roi_output_h, self.roi_output_w))
        roi_feat = roi_feat.mean(dim=[2, 3])

        # iii) per-proposal class scores and box offsets
        return self.cls_head(roi_feat), self.bbox_head(roi_feat)

    def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):
        """
        Training-time forward pass for our two-stage Fast R-CNN detector.

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes
        and category labels, from the dataloader, where N is the total number
        of GT boxes in the batch
        - bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)
        of the image that each GT box belongs to
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
        where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
        that each proposal belongs to

        Outputs:
        - total_loss: Torch scalar giving the overall training loss,
        w_cls * cls_loss + w_bbox * bbox_loss.
        """
        w_cls = 1 # for cls_scores
        w_bbox = 1 # for offsets
        B = images.shape[0]

        cls_scores, bbox_offsets = self._roi_predictions(
            images, proposals, proposal_batch_ids)

        cls_loss = 0
        bbox_loss = 0
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx
            img_proposals = proposals[in_img, :]

            # iv) positive / negative proposals and their targets for this image
            pos_mask, _neg_mask, GT_label, GT_bbox = assign_label(
                img_proposals, bboxes[bbox_batch_ids == img_idx, :], self.num_classes)

            # v) classification loss on this image's proposals
            cls_loss += ClsScoreRegression(cls_scores[in_img, :], GT_label, B)

            # box loss on positive proposals only
            pos_box_offsets = bbox_offsets[in_img, :][pos_mask, :]
            pos_proposals = img_proposals[pos_mask, :]
            bbox_loss += BboxRegression(
                pos_box_offsets, compute_offsets(pos_proposals, GT_bbox), B)

        total_loss = w_cls*cls_loss + w_bbox*bbox_loss
        return total_loss

    def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):
        """
        Inference-time forward pass for our two-stage Fast R-CNN detector

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
        where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
        that each proposal belongs to
        - thresh: Threshold value on confidence probability (softmax of the
        classification scores)
        - nms_thresh: IoU threshold for NMS

        We can output a variable number of predicted boxes per input image.
        In particular we assume that the input images[i] gives rise to P_i final
        predicted boxes.

        Outputs:
        - final_proposals: List of length (B,) where final_proposals[i] is a Tensor
        of shape (P_i, 4) giving the coordinates of the final predicted boxes for
        the input images[i]
        - final_conf_probs: List of length (B,) where final_conf_probs[i] is a
        Tensor of shape (P_i, 1) giving the predicted probabilities that the boxes
        in final_proposals[i] are objects (vs background)
        - final_class: List of length (B,), where final_class[i] is an int64 Tensor
        of shape (P_i, 1) giving the predicted category labels for each box in
        final_proposals[i].
        """
        B = images.shape[0]

        with torch.no_grad():
            cls_scores, bbox_offsets = self._roi_predictions(
                images, proposals, proposal_batch_ids)

            # refine the proposals with the predicted offsets
            pred_boxes = generate_proposal(proposals, bbox_offsets)

            # Confidence / label = best foreground class per proposal.
            # NOTE(review): assumes the background score is the last column
            # (index num_classes), matching the id passed to assign_label in
            # forward() - confirm against utils.assign_label.
            cls_prob = torch.softmax(cls_scores, dim=1)
            conf_probs, pred_class = cls_prob[:, :self.num_classes].max(dim=1)

        final_proposals = []
        final_conf_probs = []
        final_class = []
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx
            boxes = pred_boxes[in_img]
            conf = conf_probs[in_img]
            labels = pred_class[in_img]

            # filter by confidence threshold
            confident = conf > thresh
            boxes, conf, labels = boxes[confident], conf[confident], labels[confident]

            # NMS on the surviving boxes; scores must line up with the boxes
            keep = torchvision.ops.nms(boxes, conf, nms_thresh)

            final_proposals.append(boxes[keep])
            final_conf_probs.append(conf[keep].unsqueeze(1))
            final_class.append(labels[keep].unsqueeze(1))

        return final_proposals, final_conf_probs, final_class
first commit 2024-11-13 05:46:39 +00:00			`import math`

			`import torch`
			`import torch.nn as nn`
			`import torchvision`
			`from torchvision import models`

			`from utils import compute_offsets, assign_label, generate_proposal`
			`from loss import ClsScoreRegression, BboxRegression`


			`class FeatureExtractor(nn.Module):`
			`"""`
			`Image feature extraction with MobileNet.`
			`"""`
			`def __init__(self, reshape_size=224, pooling=False, verbose=False):`
			`super().__init__()`

			`self.mobilenet = models.mobilenet_v2(pretrained=True)`
			`self.mobilenet = nn.Sequential(*list(self.mobilenet.children())[:-1]) # Remove the last classifier`

			`# average pooling`
			`if pooling:`
			`self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size/32.))) # input: N x 1280 x 7 x 7`

			`for i in self.mobilenet.named_parameters():`
			`i[1].requires_grad = True # fine-tune all`

			`def forward(self, img, verbose=False):`
			`"""`
			`Inputs:`
			`- img: Batch of resized images, of shape Nx3x224x224`

			`Outputs:`
			`- feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7`
			`"""`
			`num_img = img.shape[0]`

			`img_prepro = img`

			`feat = []`
			`process_batch = 500`
			`for b in range(math.ceil(num_img/process_batch)):`
			`feat.append(self.mobilenet(img_prepro[bprocess_batch:(b+1)process_batch]`
			`).squeeze(-1).squeeze(-1)) # forward and squeeze`
			`feat = torch.cat(feat)`

			`if verbose:`
			`print('Output feature shape: ', feat.shape)`

			`return feat`


			`class FastRCNN(nn.Module):`
			`def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \`
			`roi_output_w=2, roi_output_h=2, drop_ratio=0.3):`
			`super().__init__()`

			`assert(num_classes != 0)`
			`self.num_classes = num_classes`
			`self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h`
			`self.feat_extractor = FeatureExtractor()`
			`##############################################################################`
			`# TODO: Declare the cls & bbox heads (in Fast R-CNN). #`
			`# The cls & bbox heads share a sequential module with a Linear layer, #`
			`# followed by a Dropout (p=drop_ratio), a ReLU nonlinearity and another #`
			`# Linear layer. #`
			`# The cls head is a Linear layer that predicts num_classes + 1 (background). #`
			`# The det head is a Linear layer that predicts offsets(dim=4). #`
			`# HINT: The dimension of the two Linear layers are in_dim -> hidden_dim and #`
			`# hidden_dim -> hidden_dim. #`
			`##############################################################################`
			`# Replace "pass" statement with your code`
forward completed 2024-11-18 15:46:49 +00:00			`self.cls_head = nn.Sequential(`
			`nn.Linear(in_dim, hidden_dim),`
			`nn.Dropout(drop_ratio),`
			`nn.ReLU(),`
			`nn.Linear(hidden_dim, num_classes+1)`
			`)`
			`self.bbox_head = nn.Sequential(`
			`nn.Linear(in_dim, hidden_dim),`
			`nn.Dropout(drop_ratio),`
			`nn.ReLU(),`
			`nn.Linear(hidden_dim, 4)`
			`)`
first commit 2024-11-13 05:46:39 +00:00			`##############################################################################`
			`# END OF YOUR CODE #`
			`##############################################################################`

			`def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):`
			`"""`
			`Training-time forward pass for our two-stage Faster R-CNN detector.`

			`Inputs:`
			`- images: Tensor of shape (B, 3, H, W) giving input images`
			`- bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes`
			`and category labels, from the dataloader, where N is the total number`
			`of GT boxes in the batch`
			`- bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)`
			`of the image that each GT box belongs to`
			`- proposals: Tensor of shape (M, 4) giving the proposals for input images,`
			`where M is the total number of proposals in the batch`
			`- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image`
			`that each proposals belongs to`

			`Outputs:`
			`- total_loss: Torch scalar giving the overall training loss.`
			`"""`
			`w_cls = 1 # for cls_scores`
			`w_bbox = 1 # for offsets`
			`total_loss = None`
			`##############################################################################`
			`# TODO: Implement the forward pass of Fast R-CNN. #`
			`# A few key steps are outlined as follows: #`
			`# i) Extract image fearure. #`
			`# ii) Perform RoI Align on proposals, then meanpool the feature in the #`
			`# spatial dimension. #`
			`# iii) Pass the RoI feature through the shared-fc layer. Predict #`
			`# classification scores ans box offsets. #`
			`# iv) Assign the proposals with targets of each image. #`
			`# v) Compute the cls_loss between the predicted class_prob and GT_class #`
			`# (For poistive & negative proposals) #`
			`# Compute the bbox_loss between the offsets and GT_offsets #`
			`# (For positive proposals) #`
			`# Compute the total_loss which is formulated as: #`
			`# total_loss = w_clscls_loss + w_bboxbbox_loss. #`
			`##############################################################################`
			`# Replace "pass" statement with your code`
			`B, _, H, W = images.shape`

			`# extract image feature`
forward completed 2024-11-18 15:46:49 +00:00			`feat=self.feat_extractor.forward(images)`
			`# print(feat.shape)`
first commit 2024-11-13 05:46:39 +00:00
			`# perform RoI Pool & mean pool`
forward completed 2024-11-18 15:46:49 +00:00			`feat=torchvision.ops.roi_pool(feat, torch.cat((proposal_batch_ids.unsqueeze(1), proposals),dim=1), output_size=(self.roi_output_w, self.roi_output_h))`
			`# print(feat.shape)`
			`feat=feat.mean(dim=[2,3])`
			`# print(feat.shape)`
first commit 2024-11-13 05:46:39 +00:00
			`# forward heads, get predicted cls scores & offsets`
forward completed 2024-11-18 15:46:49 +00:00			`cls_scores=self.cls_head(feat)`
			`bbox_offsets=self.bbox_head(feat)`
			`# print(cls_scores.shape, bbox_offsets.shape)`
first commit 2024-11-13 05:46:39 +00:00
			`# assign targets with proposals`
			`pos_masks, neg_masks, GT_labels, GT_bboxes = [], [], [], []`
			`for img_idx in range(B):`
			`# get the positive/negative proposals and corresponding`
			`# GT box & class label of this image`
forward completed 2024-11-18 15:46:49 +00:00			`pos_mask, neg_mask, GT_label, GT_bbox = assign_label(proposals[proposal_batch_ids==img_idx,:], bboxes[bbox_batch_ids==img_idx,:], self.num_classes)`
			`# print(pos_mask.shape, neg_mask.shape, GT_label.shape, GT_bbox.shape)`
			`pos_masks.append(pos_mask)`
			`neg_masks.append(neg_mask)`
			`GT_labels.append(GT_label)`
			`GT_bboxes.append(GT_bbox)`
first commit 2024-11-13 05:46:39 +00:00
			`# compute loss`
forward completed 2024-11-18 15:46:49 +00:00			`cls_loss = 0`
			`img_idx = 0`
			`for GT_label in GT_labels:`
			`# print(cls_scores.shape, GT_label.shape)`
			`cls_loss += ClsScoreRegression(cls_scores[proposal_batch_ids==img_idx,:], GT_label, B)`
			`img_idx += 1`
			`bbox_loss = 0`
			`img_idx=0`

			`for GT_bbox in GT_bboxes:`
			`bbox_offsets_cur=bbox_offsets[proposal_batch_ids==img_idx,:]`
			`pos_box_offsets = bbox_offsets_cur[pos_masks[img_idx],:]`
			`proposals_cur = proposals[proposal_batch_ids==img_idx,:]`
			`pos_proposals = proposals_cur[pos_masks[img_idx],:]`
			`# print(pos_box_offsets.shape, GT_bbox.shape)`
			`bbox_loss += BboxRegression(pos_box_offsets, compute_offsets(pos_proposals, GT_bbox), B)`
			`img_idx += 1`
			`total_loss=cls_loss+bbox_loss`
first commit 2024-11-13 05:46:39 +00:00
			`##############################################################################`
			`# END OF YOUR CODE #`
			`##############################################################################`
			`return total_loss`

			`def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):`
			`""""`
			`Inference-time forward pass for our two-stage Faster R-CNN detector`

			`Inputs:`
			`- images: Tensor of shape (B, 3, H, W) giving input images`
			`- proposals: Tensor of shape (M, 4) giving the proposals for input images,`
			`where M is the total number of proposals in the batch`
			`- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image`
			`that each proposals belongs to`
			`- thresh: Threshold value on confidence probability. HINT: You can convert the`
			`classification score to probability using a softmax nonlinearity.`
			`- nms_thresh: IoU threshold for NMS`

			`We can output a variable number of predicted boxes per input image.`
			`In particular we assume that the input images[i] gives rise to P_i final`
			`predicted boxes.`

			`Outputs:`
			`- final_proposals: List of length (B,) where final_proposals[i] is a Tensor`
			`of shape (P_i, 4) giving the coordinates of the final predicted boxes for`
			`the input images[i]`
			`- final_conf_probs: List of length (B,) where final_conf_probs[i] is a`
			`Tensor of shape (P_i, 1) giving the predicted probabilites that the boxes`
			`in final_proposals[i] are objects (vs background)`
			`- final_class: List of length (B,), where final_class[i] is an int64 Tensor`
			`of shape (P_i, 1) giving the predicted category labels for each box in`
			`final_proposals[i].`
			`"""`
			`final_proposals, final_conf_probs, final_class = None, None, None`
			`##############################################################################`
			# TODO: Predicting the final proposal coordinates `final_proposals`, #
			# confidence scores `final_conf_probs`, and the class index `final_class`. #
			`# The overall steps are similar to the forward pass, but now you cannot #`
			`# decide the activated nor negative proposals without GT boxes. #`
			`# You should apply post-processing (thresholding and NMS) to all proposals #`
			`# and keep the final proposals. #`
			`##############################################################################`
			`# Replace "pass" statement with your code`
			`B = images.shape[0]`

			`# extract image feature`
forward completed 2024-11-18 15:46:49 +00:00			`feat = self.feat_extractor.forward(images)`
first commit 2024-11-13 05:46:39 +00:00
			`# perform RoI Pool & mean pool`
forward completed 2024-11-18 15:46:49 +00:00			`feat=torchvision.ops.roi_pool(feat, torch.cat((proposal_batch_ids.unsqueeze(1), proposals),dim=1), output_size=(self.roi_output_w, self.roi_output_h))`
			`feat = feat.mean(dim=[2, 3])`
first commit 2024-11-13 05:46:39 +00:00
			`# forward heads, get predicted cls scores & offsets`
forward completed 2024-11-18 15:46:49 +00:00			`cls_scores = self.cls_head(feat)`
			`print(cls_scores.shape)`
			`bbox_offsets = self.bbox_head(feat)`
			`print(bbox_offsets.shape)`
first commit 2024-11-13 05:46:39 +00:00			`# get predicted boxes & class label & confidence probability`
forward completed 2024-11-18 15:46:49 +00:00			`proposals = generate_proposal(proposals, bbox_offsets)`
first commit 2024-11-13 05:46:39 +00:00
			`final_proposals = []`
			`final_conf_probs = []`
			`final_class = []`
			`# post-process to get final predictions`
			`for img_idx in range(B):`

			`# filter by threshold`
forward completed 2024-11-18 15:46:49 +00:00			`cls_prob = torch.softmax(cls_scores[proposal_batch_ids == img_idx], dim=1)`
			`print(cls_prob.shape)`
			`pos_mask = cls_prob[:, 1] > thresh`
			`print(pos_mask.shape)`
			`proposals_img = proposals[proposal_batch_ids == img_idx][pos_mask]`
			`print(proposals_img.shape)`

			`print(cls_prob.shape)`
			`final_proposals.append(proposals_img)`
			`final_conf_probs.append(cls_prob[pos_mask, 1].unsqueeze(1))`

first commit 2024-11-13 05:46:39 +00:00
			`# nms`
forward completed 2024-11-18 15:46:49 +00:00			`keep = torchvision.ops.nms(proposals_img, cls_prob[:, 1], nms_thresh)`
			`proposals_img = proposals_img[keep]`
			`cls_prob = cls_prob[keep]`

first commit 2024-11-13 05:46:39 +00:00
			`##############################################################################`
			`# END OF YOUR CODE #`
			`##############################################################################`
			`return final_proposals, final_conf_probs, final_class`