# Fast R-CNN model definitions: MobileNet feature extractor and detection heads.
import math
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torchvision
|
|
from torchvision import models
|
|
|
|
from utils import compute_offsets, assign_label, generate_proposal
|
|
from loss import ClsScoreRegression, BboxRegression
|
|
|
|
|
|
class FeatureExtractor(nn.Module):
    """
    Image feature extraction with MobileNet.

    Wraps torchvision's MobileNetV2 with its last child module (the
    classifier) removed, optionally followed by an average-pooling layer.
    Every backbone parameter is left trainable (fine-tuned).
    """

    # Maximum number of images pushed through the backbone in one chunk.
    PROCESS_BATCH = 500

    def __init__(self, reshape_size=224, pooling=False, verbose=False):
        """
        Inputs:
        - reshape_size: Side length of the resized input images; only used to
          size the optional pooling kernel, ceil(reshape_size / 32).
        - pooling: If True, append an average-pooling layer to the backbone.
        - verbose: Unused here; kept so existing callers keep working.
        """
        super().__init__()

        # Remembered so forward() knows whether the spatial dims were pooled.
        self.pooling = pooling

        backbone = models.mobilenet_v2(pretrained=True)
        # Remove the last classifier
        self.mobilenet = nn.Sequential(*list(backbone.children())[:-1])

        # average pooling
        if pooling:
            # input: N x 1280 x 7 x 7
            self.mobilenet.add_module(
                'LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size / 32.)))

        for param in self.mobilenet.parameters():
            param.requires_grad = True  # fine-tune all

    def forward(self, img, verbose=False):
        """
        Inputs:
        - img: Batch of resized images, of shape Nx3x224x224
        - verbose: If True, print the shape of the returned feature.

        Outputs:
        - feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
        """
        num_img = img.shape[0]

        feat = []
        # max(num_img, 1): an empty batch is still forwarded once, so the
        # result is an empty feature tensor rather than a torch.cat([]) error.
        for start in range(0, max(num_img, 1), self.PROCESS_BATCH):
            chunk_feat = self.mobilenet(img[start:start + self.PROCESS_BATCH])
            if self.pooling:
                # Drop the pooled 1x1 spatial dims.  Un-pooled feature maps
                # are left 4-D even when their spatial size happens to be 1.
                chunk_feat = chunk_feat.squeeze(-1).squeeze(-1)
            feat.append(chunk_feat)
        feat = torch.cat(feat)

        if verbose:
            print('Output feature shape: ', feat.shape)

        return feat
|
|
|
|
|
|
class FastRCNN(nn.Module):
    """
    Fast R-CNN style detector.

    A FeatureExtractor backbone produces an image feature map; each proposal
    is RoI-pooled and mean-pooled into one feature vector, which is fed to a
    classification head (num_classes + 1 scores, the extra one being
    background) and a box-regression head (4 offsets).
    """

    def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20,
                 roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
        """
        Inputs:
        - in_dim: Channel dimension of the RoI feature fed to the heads.
        - hidden_dim: Width of the hidden Linear layer in each head.
        - num_classes: Number of object categories (background excluded).
        - roi_output_w, roi_output_h: Output width / height of the RoI pooling.
        - drop_ratio: Dropout probability used inside each head.
        """
        super().__init__()

        assert num_classes != 0, 'num_classes must be non-zero'
        self.num_classes = num_classes
        self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
        self.feat_extractor = FeatureExtractor()

        # NOTE(review): the assignment text describes a trunk shared by both
        # heads; here each head owns its own Linear -> Dropout -> ReLU trunk.
        # Left as is so the module names / state_dict keys stay unchanged.
        # The cls head predicts num_classes + 1 (background) scores.
        self.cls_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes + 1),
        )
        # The bbox head predicts offsets (dim=4).
        self.bbox_head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.Dropout(drop_ratio),
            nn.ReLU(),
            nn.Linear(hidden_dim, 4),
        )

    def _roi_features(self, images, proposals, proposal_batch_ids):
        """Return one mean-pooled RoI feature vector per proposal."""
        # Call the module (not .forward) so registered hooks still run.
        feat = self.feat_extractor(images)

        # roi_pool takes rois as rows of [batch_index, x1, y1, x2, y2] in a
        # single tensor, so the index column is cast to the box dtype first.
        batch_col = proposal_batch_ids.unsqueeze(1).to(proposals.dtype)
        rois = torch.cat((batch_col, proposals), dim=1)

        # torchvision expects output_size as (height, width).
        # NOTE(review): no spatial_scale is passed, so proposals are used as
        # feature-map coordinates - confirm against the data pipeline.
        roi_feat = torchvision.ops.roi_pool(
            feat, rois, output_size=(self.roi_output_h, self.roi_output_w))

        # Mean-pool over the spatial dimensions.
        return roi_feat.mean(dim=[2, 3])

    def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):
        """
        Training-time forward pass for our two-stage Faster R-CNN detector.

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes
          and category labels, from the dataloader, where N is the total number
          of GT boxes in the batch
        - bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)
          of the image that each GT box belongs to
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
          where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
          that each proposals belongs to

        Outputs:
        - total_loss: Torch scalar giving the overall training loss,
          w_cls * cls_loss + w_bbox * bbox_loss.
        """
        w_cls = 1  # for cls_scores
        w_bbox = 1  # for offsets

        B = images.shape[0]

        # Extract image features, RoI-pool the proposals and mean-pool them.
        feat = self._roi_features(images, proposals, proposal_batch_ids)

        # forward heads, get predicted cls scores & offsets
        cls_scores = self.cls_head(feat)
        bbox_offsets = self.bbox_head(feat)

        cls_loss = 0
        bbox_loss = 0
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx
            proposals_img = proposals[in_img]

            # get the positive/negative proposals and corresponding
            # GT box & class label of this image
            pos_mask, _neg_mask, GT_label, GT_bbox = assign_label(
                proposals_img, bboxes[bbox_batch_ids == img_idx],
                self.num_classes)

            # NOTE(review): the class loss is taken over every proposal of the
            # image; confirm GT_label from assign_label is aligned with that.
            cls_loss += ClsScoreRegression(cls_scores[in_img], GT_label, B)

            # The box loss only uses the positive proposals.
            pos_box_offsets = bbox_offsets[in_img][pos_mask]
            pos_proposals = proposals_img[pos_mask]
            bbox_loss += BboxRegression(
                pos_box_offsets, compute_offsets(pos_proposals, GT_bbox), B)

        total_loss = w_cls * cls_loss + w_bbox * bbox_loss
        return total_loss

    def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):
        """
        Inference-time forward pass for our two-stage Faster R-CNN detector

        This method neither switches the module to eval mode nor disables
        autograd; the caller is expected to do so if desired.

        Inputs:
        - images: Tensor of shape (B, 3, H, W) giving input images
        - proposals: Tensor of shape (M, 4) giving the proposals for input images,
          where M is the total number of proposals in the batch
        - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
          that each proposals belongs to
        - thresh: Threshold value on confidence probability. The classification
          scores are converted to probabilities with a softmax.
        - nms_thresh: IoU threshold for NMS

        We can output a variable number of predicted boxes per input image.
        In particular we assume that the input images[i] gives rise to P_i final
        predicted boxes.

        Outputs:
        - final_proposals: List of length (B,) where final_proposals[i] is a Tensor
          of shape (P_i, 4) giving the coordinates of the final predicted boxes for
          the input images[i]
        - final_conf_probs: List of length (B,) where final_conf_probs[i] is a
          Tensor of shape (P_i, 1) giving the softmax probability of the
          predicted class of each box in final_proposals[i]
        - final_class: List of length (B,), where final_class[i] is an int64 Tensor
          of shape (P_i, 1) giving the predicted category labels for each box in
          final_proposals[i].
        """
        B = images.shape[0]

        # Extract image features, RoI-pool the proposals and mean-pool them.
        feat = self._roi_features(images, proposals, proposal_batch_ids)

        # forward heads, get predicted cls scores & offsets
        cls_scores = self.cls_head(feat)
        bbox_offsets = self.bbox_head(feat)

        # get predicted boxes by applying the offsets to the proposals
        proposals = generate_proposal(proposals, bbox_offsets)

        final_proposals = []
        final_conf_probs = []
        final_class = []
        # post-process to get final predictions
        for img_idx in range(B):
            in_img = proposal_batch_ids == img_idx

            cls_prob = torch.softmax(cls_scores[in_img], dim=1)
            conf, label = torch.max(cls_prob, dim=1)

            # filter by threshold, and drop proposals predicted as background
            # (index num_classes).  Element-wise `&` is required: Python `and`
            # on multi-element tensors raises.
            keep_mask = (conf > thresh) & (label != self.num_classes)
            boxes = proposals[in_img][keep_mask]
            conf = conf[keep_mask]
            label = label[keep_mask]

            # nms (class-agnostic), run before the results are stored
            keep = torchvision.ops.nms(boxes, conf, nms_thresh)

            final_proposals.append(boxes[keep])
            final_conf_probs.append(conf[keep].unsqueeze(1))
            final_class.append(label[keep].unsqueeze(1))

        return final_proposals, final_conf_probs, final_class