2024-11-13 05:46:39 +00:00
|
|
|
import numpy as np
|
|
|
|
import cv2
|
|
|
|
from matplotlib import pyplot as plt
|
|
|
|
import torch
|
|
|
|
|
|
|
|
|
|
|
|
def _draw_boxes(canvas, boxes, idx_to_class, color):
    """Draw every box in `boxes` (and its label, if present) onto `canvas` in place."""
    if boxes is None:
        return

    # detach/cpu so CUDA tensors and tensors that track gradients can be drawn too
    boxes_np = boxes.detach().cpu().numpy()
    for row in boxes_np:
        # plain Python ints: some OpenCV builds reject numpy integer scalars as points
        x_tl, y_tl, x_br, y_br = (int(v) for v in row[:4])
        cv2.rectangle(canvas, (x_tl, y_tl), (x_br, y_br), color, 2)

        if len(row) <= 4:  # coordinates only, nothing to label
            continue

        # integer key so that both dict and list style mappings work
        label = '%s' % (idx_to_class[int(row[4])])
        if len(row) > 5:  # confidence score provided as well
            label = '%s, %.2f' % (label, float(row[5]))
        cv2.putText(canvas, label, (x_tl, y_tl + 15),
                    cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)


def data_visualizer(img, idx_to_class, path, bbox=None, pred=None):
    """
    Data visualizer on the original image. Supports both GT box input and
    proposal/prediction input. The annotated image is written to `path`.

    Input:
    - img: Image input (converted with np.array and cast to uint8)
    - idx_to_class: Mapping from the class index to the class name
    - path: Output file passed to plt.savefig; also used as the figure title
    - bbox: GT bbox (drawn with colour (255, 0, 0), optional), a tensor of
      shape Nx4 or Nx5, where N is the number of GT boxes and the columns are
      (x_tl, y_tl, x_br, y_br[, class])
    - pred: Predicted bbox (drawn with colour (0, 255, 0), optional), a tensor
      of shape N'x4, N'x5 or N'x6, where N' is the number of predicted boxes
      and the columns are
      (x_tl, y_tl, x_br, y_br[, class[, object confidence score]])

    A class name is printed whenever a 5th column exists; the confidence score
    is appended only when a 6th column exists.
    """
    img_copy = np.array(img).astype('uint8')

    _draw_boxes(img_copy, bbox, idx_to_class, (255, 0, 0))
    _draw_boxes(img_copy, pred, idx_to_class, (0, 255, 0))

    plt.imshow(img_copy)
    plt.axis('off')
    plt.title(path)
    plt.savefig(path)
    plt.close()
|
|
|
|
|
|
|
|
|
|
|
|
def coord_trans(bbox, bbox_batch_idx, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'):
    """
    Rescale box coordinates between the image (pixel) coordinate system and
    the activation-map coordinate system.

    Input:
    - bbox: Boxes (GT boxes, anchors or proposals) whose first four columns
      are (x_tl, y_tl, x_br, y_br); only those four columns are rescaled
    - bbox_batch_idx: For every box, the index of the image it belongs to,
      of shape M
    - w_pixel: Width in pixels of every original image, of shape B
    - h_pixel: Height in pixels of every original image, of shape B
    - w_amap: Width of the activation map, scalar
    - h_amap: Height of the activation map, scalar
    - mode: 'p2a' converts pixel coordinates to activation-map coordinates,
      'a2p' converts activation-map coordinates to pixel coordinates

    Output:
    - resized_bbox: Rescaled copy of bbox with the same shape. When bbox has
      no rows, the input tensor itself is returned.
    """
    assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!'
    assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1'

    # Nothing to rescale: hand the empty input straight back.
    if bbox.shape[0] == 0:
        return bbox

    scaled = bbox.clone()

    # Pixels per activation-map cell for the image each box belongs to;
    # shaped (M, 1) so that it broadcasts over the two x (or y) columns.
    x_scale = (w_pixel[bbox_batch_idx] * 1. / w_amap).view(-1, 1)
    y_scale = (h_pixel[bbox_batch_idx] * 1. / h_amap).view(-1, 1)

    x_cols, y_cols = [0, 2], [1, 3]
    if mode == 'a2p':
        # activation map -> pixel
        scaled[:, x_cols] *= x_scale
        scaled[:, y_cols] *= y_scale
    else:
        # pixel -> activation map
        scaled[:, x_cols] /= x_scale
        scaled[:, y_cols] /= y_scale

    return scaled
|
|
|
|
|
|
|
|
|
|
|
|
def generate_anchor(anc_per_grid, grid):
    """
    Place every anchor shape at every grid centre.

    Inputs:
    - anc_per_grid: Tensor of shape (A, 2); anc_per_grid[a] = (w, h) is the
      width and height of the a'th anchor shape.
    - grid: Tensor of shape (B, H', W', 2) giving the (x, y) centre of each
      feature-map cell.

    Outputs:
    - anchors: Tensor of shape (B, A, H', W', 4). anchors[b, a, h, w] is
      (x_tl, y_tl, x_br, y_br) for the box of shape anc_per_grid[a] centred
      at grid[b, h, w]. The result has the dtype and device of grid.
    """
    num_shapes, _ = anc_per_grid.shape
    batch, rows, cols, _ = grid.shape

    # Half of each anchor's (w, h), converted to grid's dtype/device and laid
    # out so that it broadcasts against the centres below.
    half_extent = anc_per_grid.to(grid).view(1, num_shapes, 1, 1, -1) / 2
    centres = grid.view(batch, 1, rows, cols, -1)

    top_left = centres - half_extent
    bottom_right = centres + half_extent

    return torch.cat((top_left, bottom_right), dim=-1)
|
|
|
|
|
|
|
|
|
|
|
|
def compute_iou(anchors, bboxes):
    """
    Compute the intersection-over-union between anchors and GT boxes.

    Inputs:
    - anchors: Anchor/proposal boxes, of shape (M, 4), each given as
      (x_tl, y_tl, x_br, y_br)
    - bboxes: GT boxes, of shape (N, 4), each given as
      (x_tl, y_tl, x_br, y_br)

    Outputs:
    - iou: IoU matrix of shape (M, N), in torch's default floating dtype and
      on the device of anchors. iou[i, j] is the IoU of anchors[i] and
      bboxes[j]; pairs whose union has no area get an IoU of 0.
    """
    # Broadcast (M, 1, 2) against (1, N, 2) to get all M x N pairs at once
    # instead of looping over them in Python.
    inter_tl = torch.max(anchors[:, None, :2], bboxes[None, :, :2])
    inter_br = torch.min(anchors[:, None, 2:4], bboxes[None, :, 2:4])

    # Boxes that do not overlap give a negative extent; clamp it to zero.
    inter_wh = (inter_br - inter_tl).clamp(min=0)
    inter = inter_wh[..., 0] * inter_wh[..., 1]

    anchor_area = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
    bbox_area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    union = anchor_area[:, None] + bbox_area[None, :] - inter

    # Avoid 0 / 0 = NaN for degenerate (zero-area) pairs: divide by a dummy 1
    # there and report the IoU as 0.
    has_area = union > 0
    safe_union = torch.where(has_area, union, torch.ones_like(union))
    iou = inter / safe_union
    iou = torch.where(has_area, iou, torch.zeros_like(iou))

    return iou.to(dtype=torch.get_default_dtype())
|
|
|
|
|
|
|
|
|
|
|
|
def compute_offsets(anchors, bboxes):
    """
    Encode each GT box relative to its anchor.

    Inputs:
    - anchors: Anchor boxes, of shape (M, 4), as (x_tl, y_tl, x_br, y_br)
    - bboxes: GT boxes, of shape (M, 4), as (x_tl, y_tl, x_br, y_br);
      bboxes[m] is the box matched to anchors[m]

    Outputs:
    - offsets: Tensor of shape (M, 4) holding (tx, ty, tw, th), where
      (tx, ty) is the centre displacement divided by the anchor size and
      (tw, th) is the log of the GT size over the anchor size.
    """
    anchor_tl, anchor_br = anchors[:, :2], anchors[:, 2:4]
    gt_tl, gt_br = bboxes[:, :2], bboxes[:, 2:4]

    # log size ratio
    size_terms = torch.log((gt_br - gt_tl) / (anchor_br - anchor_tl))

    # centre displacement, normalised by the anchor size
    centre_shift = (gt_tl + gt_br - anchor_tl - anchor_br) / 2.
    centre_terms = centre_shift / (anchor_br - anchor_tl)

    return torch.cat((centre_terms, size_terms), dim=-1)
|
|
|
|
|
|
|
|
|
|
|
|
def generate_proposal(anchors, offsets):
    """
    Proposal generator: decode (tx, ty, tw, th) offsets into boxes.

    This is the inverse of compute_offsets, i.e. the offsets are interpreted
    as
        tx = (xc_p - xc_a) / w_a,    ty = (yc_p - yc_a) / h_a
        tw = log(w_p / w_a),         th = log(h_p / h_a)
    where (xc, yc) is a box centre, (w, h) its size, `a` the anchor and `p`
    the proposal.

    Inputs:
    - anchors: Anchor boxes, of shape (M, 4). Anchors are represented
      by the coordinates of their top-left and bottom-right corners.
    - offsets: Transformations of shape (M, 4) that will be used to
      convert anchor boxes into region proposals. The transformation
      offsets[m] = (tx, ty, tw, th) will be applied to the anchor
      anchors[m].

    Outputs:
    - proposals: Region proposals of shape (M, 4), represented by the
      coordinates of their top-left and bottom-right corners. Applying the
      transform offsets[m] to anchors[m] gives proposals[m]; all-zero
      offsets return the anchor unchanged.
    """
    anchor_wh = anchors[..., 2:4] - anchors[..., :2]
    anchor_centre = (anchors[..., :2] + anchors[..., 2:4]) / 2.

    # Shift the centre by a fraction of the anchor size and rescale the size
    # (not the corner coordinates) by exp(tw), exp(th).
    proposal_centre = anchor_centre + offsets[..., :2] * anchor_wh
    proposal_wh = anchor_wh * torch.exp(offsets[..., 2:4])

    # Back to corner representation.
    half_wh = proposal_wh / 2.
    proposals = torch.cat((proposal_centre - half_wh, proposal_centre + half_wh), dim=-1)

    return proposals
|
|
|
|
|
|
|
|
|
|
|
|
@torch.no_grad()
def assign_label(proposals, bboxes, background_id, pos_thresh=0.5, neg_thresh=0.5, pos_fraction=0.25):
    """
    Determine the activated (positive) and negative proposals for model training.

    A proposal is positive if either
      (i) it has the highest IoU overlap with some GT box (and that IoU is
          greater than 0), or
      (ii) its IoU overlap with any GT box is higher than pos_thresh.
    Note: One proposal is matched to at most one GT box (the one with the
    largest IoU overlap).

    A proposal is a negative candidate if its largest IoU over all GT boxes is
    lower than neg_thresh. Negatives are then sampled at random, with
    replacement, from those candidates so that positives make up roughly
    pos_fraction of the selection; because of the replacement, the number of
    distinct negatives can be smaller than the number requested.

    Inputs:
    - proposals: Proposal boxes, of shape (M, 4), where M is the number of proposals
    - bboxes: GT boxes of shape Nx5, where N is the number of GT boxes,
      5 indicates (x_tl, y_tl, x_br, y_br) and class index
    - background_id: Class id of the background class
    - pos_thresh: Positive threshold value
    - neg_thresh: Negative threshold value
    - pos_fraction: a factor balancing pos/neg proposals

    Outputs:
    - activated_anc_mask: a binary mask indicating the activated proposals, of shape M
    - negative_anc_mask: a binary mask indicating the sampled negative proposals, of shape M
    - GT_class: GT class category on all proposals, background class for non-activated proposals,
      of shape M
    - bboxes: GT bboxes on activated proposals, of shape M'x4, where M' is the number of
      activated proposals

    With no proposals or no GT boxes, both masks are all False, every
    proposal is labelled background_id and the returned bboxes are empty.
    """
    M = proposals.shape[0]
    N = bboxes.shape[0]

    # Nothing to match: the reductions below are undefined on empty dimensions.
    if M == 0 or N == 0:
        no_match = torch.zeros(M, dtype=torch.bool, device=proposals.device)
        GT_class = torch.full((M,), background_id, dtype=torch.long, device=bboxes.device)
        return no_match, no_match.clone(), GT_class, bboxes[:0, :4]

    iou_mat = compute_iou(proposals, bboxes[:, :4])  # (M, N)

    # activated/positive proposals
    max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)  # best GT box per proposal
    max_iou_per_box = iou_mat.max(dim=0, keepdim=True)[0]       # best proposal per GT box, (1, N)
    pair_is_positive = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0)
    pair_is_positive |= (iou_mat > pos_thresh)  # using the pos_thresh condition as well
    activated_anc_mask = pair_is_positive.any(dim=-1)  # (M, )
    activated_anc_ind = torch.nonzero(activated_anc_mask).squeeze(-1)

    # GT class: if a proposal matches multiple GT boxes, the box with the
    # largest IoU decides; indexing with a tensor returns a fresh tensor, so
    # the in-place write below does not touch the caller's bboxes.
    GT_class = bboxes[:, 4].long()[max_iou_per_anc_ind]  # (M, )
    GT_class[~activated_anc_mask] = background_id

    # GT bboxes of the activated proposals
    matched_bboxes = bboxes[:, :4][max_iou_per_anc_ind][activated_anc_ind]

    # negative proposals
    # NOTE(review): a proposal activated through rule (i) with an IoU below
    # neg_thresh is also a negative candidate here -- confirm this is intended.
    negative_candidates = torch.nonzero(max_iou_per_anc < neg_thresh).squeeze(-1)
    # balance pos/neg proposals, random choice
    num_neg = int(activated_anc_ind.shape[0] * (1 - pos_fraction) / pos_fraction)
    negative_anc_mask = torch.zeros_like(activated_anc_mask)
    # torch.randint requires a non-empty range, so skip sampling when there
    # are no candidates (or nothing to sample).
    if num_neg > 0 and negative_candidates.shape[0] > 0:
        picks = torch.randint(0, negative_candidates.shape[0], (num_neg,))
        negative_anc_mask[negative_candidates[picks]] = True

    return activated_anc_mask, negative_anc_mask, GT_class, matched_bboxes
|