first commit
commit 16b08f040b
@@ -0,0 +1,7 @@
|
|||
.venv
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
ckpts
|
||||
data
|
||||
exp
|
|
@@ -0,0 +1,904 @@
|
|||
import glob
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import operator
|
||||
import sys
|
||||
import argparse
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--path', type=str, help="path to the directory that contains mAP_input/ (results are written to mAP_output/)")
|
||||
parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true")
|
||||
parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true")
|
||||
parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true")
|
||||
# argparse receiving list of classes to be ignored (e.g., python main.py --ignore person book)
|
||||
parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.")
|
||||
# argparse receiving list of classes with specific IoU (e.g., python main.py --set-class-iou person 0.7)
|
||||
parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.")
|
||||
args = parser.parse_args()
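# Example invocation (a sketch; assumes this script is saved as main.py, as in the
# comments above, and that the training script has written its results under ./exp/fast_rcnn):
#   python main.py --path ./exp/fast_rcnn --no-plot --no-animation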
|
||||
|
||||
'''
|
||||
0,0 ------> x (width)
 |
 |   (Left,Top)
 |      *_________
 |      |         |
 |      |         |
 y      |_________|
(height)          *
            (Right,Bottom)
'''
|
||||
|
||||
# if there are no classes to ignore then replace None by empty list
|
||||
if args.ignore is None:
|
||||
args.ignore = []
|
||||
|
||||
specific_iou_flagged = False
|
||||
if args.set_class_iou is not None:
|
||||
specific_iou_flagged = True
|
||||
|
||||
# make sure that the cwd() is the location of the python script (so that every path makes sense)
|
||||
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
GT_PATH = os.path.join(args.path, 'mAP_input', 'ground-truth')
|
||||
DR_PATH = os.path.join(args.path, 'mAP_input', 'detection-results')
|
||||
# if there are no images then no animation can be shown
|
||||
IMG_PATH = os.path.join(args.path, 'mAP_input', 'images-optional')
|
||||
if os.path.exists(IMG_PATH):
|
||||
for dirpath, dirnames, files in os.walk(IMG_PATH):
|
||||
if not files:
|
||||
# no image files found
|
||||
args.no_animation = True
|
||||
else:
|
||||
args.no_animation = True
|
||||
|
||||
# try to import OpenCV if the user didn't choose the option --no-animation
|
||||
show_animation = False
|
||||
if not args.no_animation:
|
||||
try:
|
||||
import cv2
|
||||
show_animation = True
|
||||
except ImportError:
|
||||
print("\"opencv-python\" not found, please install to visualize the results.")
|
||||
args.no_animation = True
|
||||
|
||||
# try to import Matplotlib if the user didn't choose the option --no-plot
|
||||
draw_plot = False
|
||||
if not args.no_plot:
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
draw_plot = True
|
||||
except ImportError:
|
||||
print("\"matplotlib\" not found, please install it to get the resulting plots.")
|
||||
args.no_plot = True
|
||||
|
||||
|
||||
def log_average_miss_rate(prec, rec, num_images):
|
||||
"""
|
||||
log-average miss rate:
|
||||
Calculated by averaging miss rates at 9 evenly spaced FPPI points
|
||||
between 1e-2 and 1e0 (i.e., 0.01 and 1), in log-space.
|
||||
|
||||
output:
|
||||
lamr | log-average miss rate
|
||||
mr | miss rate
|
||||
fppi | false positives per image
|
||||
|
||||
references:
|
||||
[1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
|
||||
State of the Art." Pattern Analysis and Machine Intelligence, IEEE
|
||||
Transactions on 34.4 (2012): 743 - 761.
|
||||
"""
|
||||
|
||||
# if there were no detections of that class
|
||||
if prec.size == 0:
|
||||
lamr = 0
|
||||
mr = 1
|
||||
fppi = 0
|
||||
return lamr, mr, fppi
|
||||
|
||||
fppi = (1 - prec)
|
||||
mr = (1 - rec)
|
||||
|
||||
fppi_tmp = np.insert(fppi, 0, -1.0)
|
||||
mr_tmp = np.insert(mr, 0, 1.0)
|
||||
|
||||
# Use 9 evenly spaced reference points in log-space
|
||||
ref = np.logspace(-2.0, 0.0, num = 9)
|
||||
for i, ref_i in enumerate(ref):
|
||||
# np.where() will always find at least 1 index, since min(ref) = 0.01 and min(fppi_tmp) = -1.0
|
||||
j = np.where(fppi_tmp <= ref_i)[-1][-1]
|
||||
ref[i] = mr_tmp[j]
|
||||
|
||||
# log(0) is undefined, so we use the np.maximum(1e-10, ref)
|
||||
lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref))))
|
||||
|
||||
return lamr, mr, fppi
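# Hedged toy example of calling the function above (values are made up):
#   lamr, mr, fppi = log_average_miss_rate(np.array([1.0, 0.5]), np.array([0.5, 1.0]), 10)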
|
||||
|
||||
"""
|
||||
throw error and exit
|
||||
"""
|
||||
def error(msg):
|
||||
print(msg)
|
||||
sys.exit(0)
|
||||
|
||||
"""
|
||||
check if the number is a float between 0.0 and 1.0
|
||||
"""
|
||||
def is_float_between_0_and_1(value):
|
||||
try:
|
||||
val = float(value)
|
||||
if val > 0.0 and val < 1.0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
"""
|
||||
Calculate the AP given the recall and precision array
|
||||
1st) We compute a version of the measured precision/recall curve with
|
||||
precision monotonically decreasing
|
||||
2nd) We compute the AP as the area under this curve by numerical integration.
|
||||
"""
|
||||
def voc_ap(rec, prec):
|
||||
"""
|
||||
--- Official matlab code VOC2012---
|
||||
mrec=[0 ; rec ; 1];
|
||||
mpre=[0 ; prec ; 0];
|
||||
for i=numel(mpre)-1:-1:1
|
||||
mpre(i)=max(mpre(i),mpre(i+1));
|
||||
end
|
||||
i=find(mrec(2:end)~=mrec(1:end-1))+1;
|
||||
ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
|
||||
"""
|
||||
rec.insert(0, 0.0) # insert 0.0 at beginning of list
|
||||
rec.append(1.0) # insert 1.0 at end of list
|
||||
mrec = rec[:]
|
||||
prec.insert(0, 0.0) # insert 0.0 at beginning of list
|
||||
prec.append(0.0) # insert 0.0 at end of list
|
||||
mpre = prec[:]
|
||||
"""
|
||||
This part makes the precision monotonically decreasing
|
||||
(goes from the end to the beginning)
|
||||
matlab: for i=numel(mpre)-1:-1:1
|
||||
mpre(i)=max(mpre(i),mpre(i+1));
|
||||
"""
|
||||
# matlab indexes start in 1 but python in 0, so I have to do:
|
||||
# range(start=(len(mpre) - 2), end=0, step=-1)
|
||||
# also the python function range excludes the end, resulting in:
|
||||
# range(start=(len(mpre) - 2), end=-1, step=-1)
|
||||
for i in range(len(mpre)-2, -1, -1):
|
||||
mpre[i] = max(mpre[i], mpre[i+1])
|
||||
"""
|
||||
This part creates a list of indexes where the recall changes
|
||||
matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
|
||||
"""
|
||||
i_list = []
|
||||
for i in range(1, len(mrec)):
|
||||
if mrec[i] != mrec[i-1]:
|
||||
i_list.append(i) # if it was matlab would be i + 1
|
||||
"""
|
||||
The Average Precision (AP) is the area under the curve
|
||||
(numerical integration)
|
||||
matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
|
||||
"""
|
||||
ap = 0.0
|
||||
for i in i_list:
|
||||
ap += ((mrec[i]-mrec[i-1])*mpre[i])
|
||||
return ap, mrec, mpre
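# Worked toy example: with rec=[0.5, 1.0] and prec=[1.0, 0.5] the interpolated
# precision becomes [1.0, 1.0, 0.5, 0.0], so AP = 0.5*1.0 + 0.5*0.5 = 0.75:
#   ap, mrec, mpre = voc_ap([0.5, 1.0], [1.0, 0.5])   # ap == 0.75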
|
||||
|
||||
|
||||
"""
|
||||
Convert the lines of a file to a list
|
||||
"""
|
||||
def file_lines_to_list(path):
|
||||
# open txt file lines to a list
|
||||
with open(path) as f:
|
||||
content = f.readlines()
|
||||
# remove whitespace characters like `\n` at the end of each line
|
||||
content = [x.strip() for x in content]
|
||||
return content
|
||||
|
||||
"""
|
||||
Draws text in image
|
||||
"""
|
||||
def draw_text_in_image(img, text, pos, color, line_width):
|
||||
font = cv2.FONT_HERSHEY_PLAIN
|
||||
fontScale = 1
|
||||
lineType = 1
|
||||
bottomLeftCornerOfText = pos
|
||||
cv2.putText(img, text,
|
||||
bottomLeftCornerOfText,
|
||||
font,
|
||||
fontScale,
|
||||
color,
|
||||
lineType)
|
||||
text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0]
|
||||
return img, (line_width + text_width)
|
||||
|
||||
"""
|
||||
Plot - adjust axes
|
||||
"""
|
||||
def adjust_axes(r, t, fig, axes):
|
||||
# get text width for re-scaling
|
||||
bb = t.get_window_extent(renderer=r)
|
||||
text_width_inches = bb.width / fig.dpi
|
||||
# get axis width in inches
|
||||
current_fig_width = fig.get_figwidth()
|
||||
new_fig_width = current_fig_width + text_width_inches
|
||||
proportion = new_fig_width / current_fig_width
|
||||
# get axis limit
|
||||
x_lim = axes.get_xlim()
|
||||
axes.set_xlim([x_lim[0], x_lim[1]*proportion])
|
||||
|
||||
"""
|
||||
Draw plot using Matplotlib
|
||||
"""
|
||||
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
|
||||
# sort the dictionary by decreasing value, into a list of tuples
|
||||
sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
|
||||
# unpacking the list of tuples into two lists
|
||||
sorted_keys, sorted_values = zip(*sorted_dic_by_value)
|
||||
#
|
||||
if true_p_bar != "":
|
||||
"""
|
||||
Special case to draw in:
|
||||
- green -> TP: True Positives (object detected and matches ground-truth)
|
||||
- red -> FP: False Positives (object detected but does not match ground-truth)
|
||||
- pink -> FN: False Negatives (object not detected but present in the ground-truth)
|
||||
"""
|
||||
fp_sorted = []
|
||||
tp_sorted = []
|
||||
for key in sorted_keys:
|
||||
fp_sorted.append(dictionary[key] - true_p_bar[key])
|
||||
tp_sorted.append(true_p_bar[key])
|
||||
plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
|
||||
plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
|
||||
# add legend
|
||||
plt.legend(loc='lower right')
|
||||
"""
|
||||
Write number on side of bar
|
||||
"""
|
||||
fig = plt.gcf() # gcf - get current figure
|
||||
axes = plt.gca()
|
||||
r = fig.canvas.get_renderer()
|
||||
for i, val in enumerate(sorted_values):
|
||||
fp_val = fp_sorted[i]
|
||||
tp_val = tp_sorted[i]
|
||||
fp_str_val = " " + str(fp_val)
|
||||
tp_str_val = fp_str_val + " " + str(tp_val)
|
||||
# trick to paint multicolor with offset:
|
||||
# first paint everything and then repaint the first number
|
||||
t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
|
||||
plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
|
||||
if i == (len(sorted_values)-1): # largest bar
|
||||
adjust_axes(r, t, fig, axes)
|
||||
else:
|
||||
plt.barh(range(n_classes), sorted_values, color=plot_color)
|
||||
"""
|
||||
Write number on side of bar
|
||||
"""
|
||||
fig = plt.gcf() # gcf - get current figure
|
||||
axes = plt.gca()
|
||||
r = fig.canvas.get_renderer()
|
||||
for i, val in enumerate(sorted_values):
|
||||
str_val = " " + str(val) # add a space before
|
||||
if val < 1.0:
|
||||
str_val = " {0:.2f}".format(val)
|
||||
t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
|
||||
# re-set axes to show number inside the figure
|
||||
if i == (len(sorted_values)-1): # largest bar
|
||||
adjust_axes(r, t, fig, axes)
|
||||
# set window title
|
||||
fig.canvas.manager.set_window_title(window_title)
|
||||
# write classes in y axis
|
||||
tick_font_size = 12
|
||||
plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
|
||||
"""
|
||||
Re-scale height accordingly
|
||||
"""
|
||||
init_height = fig.get_figheight()
|
||||
# compute the matrix height in points and inches
|
||||
dpi = fig.dpi
|
||||
height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
|
||||
height_in = height_pt / dpi
|
||||
# compute the required figure height
|
||||
top_margin = 0.15 # in percentage of the figure height
|
||||
bottom_margin = 0.05 # in percentage of the figure height
|
||||
figure_height = height_in / (1 - top_margin - bottom_margin)
|
||||
# set new height
|
||||
if figure_height > init_height:
|
||||
fig.set_figheight(figure_height)
|
||||
|
||||
# set plot title
|
||||
plt.title(plot_title, fontsize=14)
|
||||
# set axis titles
|
||||
# plt.xlabel('classes')
|
||||
plt.xlabel(x_label, fontsize='large')
|
||||
# adjust size of window
|
||||
fig.tight_layout()
|
||||
# save the plot
|
||||
fig.savefig(output_path)
|
||||
# show image
|
||||
if to_show:
|
||||
plt.show()
|
||||
# close the plot
|
||||
plt.close()
|
||||
|
||||
"""
|
||||
Create a ".temp_files/" and "output/" directory
|
||||
"""
|
||||
TEMP_FILES_PATH = os.path.join(args.path, ".temp_files")
|
||||
if not os.path.exists(TEMP_FILES_PATH): # if it doesn't exist already
|
||||
os.makedirs(TEMP_FILES_PATH)
|
||||
output_files_path = os.path.join(args.path, "mAP_output")
|
||||
if os.path.exists(output_files_path): # if it exist already
|
||||
# reset the output directory
|
||||
shutil.rmtree(output_files_path)
|
||||
|
||||
os.makedirs(output_files_path)
|
||||
if draw_plot:
|
||||
os.makedirs(os.path.join(output_files_path, "classes"))
|
||||
if show_animation:
|
||||
os.makedirs(os.path.join(output_files_path, "images", "detections_one_by_one"))
|
||||
|
||||
"""
|
||||
ground-truth
|
||||
Load each of the ground-truth files into a temporary ".json" file.
|
||||
Create a list of all the class names present in the ground-truth (gt_classes).
|
||||
"""
|
||||
# get a list with the ground-truth files
|
||||
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
|
||||
if len(ground_truth_files_list) == 0:
|
||||
error("Error: No ground-truth files found!")
|
||||
ground_truth_files_list.sort()
|
||||
# dictionary with counter per class
|
||||
gt_counter_per_class = {}
|
||||
counter_images_per_class = {}
|
||||
|
||||
gt_files = []
|
||||
for txt_file in ground_truth_files_list:
|
||||
#print(txt_file)
|
||||
file_id = txt_file.split(".txt", 1)[0]
|
||||
file_id = os.path.basename(os.path.normpath(file_id))
|
||||
# check if there is a correspondent detection-results file
|
||||
temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
|
||||
if not os.path.exists(temp_path):
|
||||
error_msg = "Error. File not found: {}\n".format(temp_path)
|
||||
error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
|
||||
error(error_msg)
|
||||
lines_list = file_lines_to_list(txt_file)
|
||||
# create ground-truth dictionary
|
||||
bounding_boxes = []
|
||||
is_difficult = False
|
||||
already_seen_classes = []
|
||||
for line in lines_list:
|
||||
try:
|
||||
if "difficult" in line:
|
||||
class_name, left, top, right, bottom, _difficult = line.split()
|
||||
is_difficult = True
|
||||
else:
|
||||
class_name, left, top, right, bottom = line.split()
|
||||
except ValueError:
|
||||
error_msg = "Error: File " + txt_file + " in the wrong format.\n"
|
||||
error_msg += " Expected: <class_name> <left> <top> <right> <bottom> ['difficult']\n"
|
||||
error_msg += " Received: " + line
|
||||
error_msg += "\n\nIf you have a <class_name> with spaces between words you should remove them\n"
|
||||
error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder."
|
||||
error(error_msg)
|
||||
# check if class is in the ignore list, if yes skip
|
||||
if class_name in args.ignore:
|
||||
continue
|
||||
bbox = left + " " + top + " " + right + " " + bottom
|
||||
if is_difficult:
|
||||
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True})
|
||||
is_difficult = False
|
||||
else:
|
||||
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False})
|
||||
# count that object
|
||||
if class_name in gt_counter_per_class:
|
||||
gt_counter_per_class[class_name] += 1
|
||||
else:
|
||||
# if class didn't exist yet
|
||||
gt_counter_per_class[class_name] = 1
|
||||
|
||||
if class_name not in already_seen_classes:
|
||||
if class_name in counter_images_per_class:
|
||||
counter_images_per_class[class_name] += 1
|
||||
else:
|
||||
# if class didn't exist yet
|
||||
counter_images_per_class[class_name] = 1
|
||||
already_seen_classes.append(class_name)
|
||||
|
||||
|
||||
# dump bounding_boxes into a ".json" file
|
||||
new_temp_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
|
||||
gt_files.append(new_temp_file)
|
||||
with open(new_temp_file, 'w') as outfile:
|
||||
json.dump(bounding_boxes, outfile)
|
||||
|
||||
gt_classes = list(gt_counter_per_class.keys())
|
||||
# let's sort the classes alphabetically
|
||||
gt_classes = sorted(gt_classes)
|
||||
n_classes = len(gt_classes)
|
||||
#print(gt_classes)
|
||||
#print(gt_counter_per_class)
|
||||
|
||||
"""
|
||||
Check format of the flag --set-class-iou (if used)
|
||||
e.g. check if class exists
|
||||
"""
|
||||
if specific_iou_flagged:
|
||||
n_args = len(args.set_class_iou)
|
||||
error_msg = \
|
||||
'\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]'
|
||||
if n_args % 2 != 0:
|
||||
error('Error, missing arguments. Flag usage:' + error_msg)
|
||||
# [class_1] [IoU_1] [class_2] [IoU_2]
|
||||
# specific_iou_classes = ['class_1', 'class_2']
|
||||
specific_iou_classes = args.set_class_iou[::2] # even
|
||||
# iou_list = ['IoU_1', 'IoU_2']
|
||||
iou_list = args.set_class_iou[1::2] # odd
|
||||
if len(specific_iou_classes) != len(iou_list):
|
||||
error('Error, missing arguments. Flag usage:' + error_msg)
|
||||
for tmp_class in specific_iou_classes:
|
||||
if tmp_class not in gt_classes:
|
||||
error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg)
|
||||
for num in iou_list:
|
||||
if not is_float_between_0_and_1(num):
|
||||
error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg)
|
||||
|
||||
"""
|
||||
detection-results
|
||||
Load each of the detection-results files into a temporary ".json" file.
|
||||
"""
|
||||
# get a list with the detection-results files
|
||||
dr_files_list = glob.glob(DR_PATH + '/*.txt')
|
||||
dr_files_list.sort()
|
||||
|
||||
for class_index, class_name in enumerate(gt_classes):
|
||||
bounding_boxes = []
|
||||
for txt_file in dr_files_list:
|
||||
#print(txt_file)
|
||||
# the first time it checks if all the corresponding ground-truth files exist
|
||||
file_id = txt_file.split(".txt",1)[0]
|
||||
file_id = os.path.basename(os.path.normpath(file_id))
|
||||
temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
|
||||
if class_index == 0:
|
||||
if not os.path.exists(temp_path):
|
||||
error_msg = "Error. File not found: {}\n".format(temp_path)
|
||||
error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
|
||||
error(error_msg)
|
||||
lines = file_lines_to_list(txt_file)
|
||||
for line in lines:
|
||||
try:
|
||||
tmp_class_name, confidence, left, top, right, bottom = line.split()
|
||||
except ValueError:
|
||||
error_msg = "Error: File " + txt_file + " in the wrong format.\n"
|
||||
error_msg += " Expected: <class_name> <confidence> <left> <top> <right> <bottom>\n"
|
||||
error_msg += " Received: " + line
|
||||
error(error_msg)
|
||||
if tmp_class_name == class_name:
|
||||
#print("match")
|
||||
bbox = left + " " + top + " " + right + " " + bottom
|
||||
bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox})
|
||||
#print(bounding_boxes)
|
||||
# sort detection-results by decreasing confidence
|
||||
bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True)
|
||||
with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
|
||||
json.dump(bounding_boxes, outfile)
|
||||
|
||||
"""
|
||||
Calculate the AP for each class
|
||||
"""
|
||||
sum_AP = 0.0
|
||||
ap_dictionary = {}
|
||||
lamr_dictionary = {}
|
||||
# open file to store the output
|
||||
with open(output_files_path + "/output.txt", 'w') as output_file:
|
||||
output_file.write("# AP and precision/recall per class\n")
|
||||
count_true_positives = {}
|
||||
for class_index, class_name in enumerate(gt_classes):
|
||||
count_true_positives[class_name] = 0
|
||||
"""
|
||||
Load detection-results of that class
|
||||
"""
|
||||
dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
|
||||
dr_data = json.load(open(dr_file))
|
||||
|
||||
"""
|
||||
Assign detection-results to ground-truth objects
|
||||
"""
|
||||
nd = len(dr_data)
|
||||
tp = [0] * nd # creates an array of zeros of size nd
|
||||
fp = [0] * nd
|
||||
for idx, detection in enumerate(dr_data):
|
||||
file_id = detection["file_id"]
|
||||
if show_animation:
|
||||
# find ground truth image
|
||||
ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
|
||||
#tifCounter = len(glob.glob1(myPath,"*.tif"))
|
||||
if len(ground_truth_img) == 0:
|
||||
error("Error. Image not found with id: " + file_id)
|
||||
elif len(ground_truth_img) > 1:
|
||||
error("Error. Multiple image with id: " + file_id)
|
||||
else: # found image
|
||||
#print(IMG_PATH + "/" + ground_truth_img[0])
|
||||
# Load image
|
||||
img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
|
||||
# load image with draws of multiple detections
|
||||
img_cumulative_path = output_files_path + "/images/" + ground_truth_img[0]
|
||||
if os.path.isfile(img_cumulative_path):
|
||||
img_cumulative = cv2.imread(img_cumulative_path)
|
||||
else:
|
||||
img_cumulative = img.copy()
|
||||
# Add bottom border to image
|
||||
bottom_border = 60
|
||||
BLACK = [0, 0, 0]
|
||||
img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
|
||||
# assign detection-results to ground truth object if any
|
||||
# open ground-truth with that file_id
|
||||
gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
|
||||
ground_truth_data = json.load(open(gt_file))
|
||||
ovmax = -1
|
||||
gt_match = -1
|
||||
# load detected object bounding-box
|
||||
bb = [ float(x) for x in detection["bbox"].split() ]
|
||||
for obj in ground_truth_data:
|
||||
# look for a class_name match
|
||||
if obj["class_name"] == class_name:
|
||||
bbgt = [ float(x) for x in obj["bbox"].split() ]
|
||||
bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])]
|
||||
iw = bi[2] - bi[0] + 1
|
||||
ih = bi[3] - bi[1] + 1
|
||||
if iw > 0 and ih > 0:
|
||||
# compute overlap (IoU) = area of intersection / area of union
|
||||
ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
|
||||
ov = iw * ih / ua
|
||||
if ov > ovmax:
|
||||
ovmax = ov
|
||||
gt_match = obj
|
||||
|
||||
# assign detection as true positive/don't care/false positive
|
||||
if show_animation:
|
||||
status = "NO MATCH FOUND!" # status is only used in the animation
|
||||
# set minimum overlap
|
||||
min_overlap = MINOVERLAP
|
||||
if specific_iou_flagged:
|
||||
if class_name in specific_iou_classes:
|
||||
index = specific_iou_classes.index(class_name)
|
||||
min_overlap = float(iou_list[index])
|
||||
if ovmax >= min_overlap:
|
||||
if "difficult" not in gt_match:
|
||||
if not bool(gt_match["used"]):
|
||||
# true positive
|
||||
tp[idx] = 1
|
||||
gt_match["used"] = True
|
||||
count_true_positives[class_name] += 1
|
||||
# update the ".json" file
|
||||
with open(gt_file, 'w') as f:
|
||||
f.write(json.dumps(ground_truth_data))
|
||||
if show_animation:
|
||||
status = "MATCH!"
|
||||
else:
|
||||
# false positive (multiple detection)
|
||||
fp[idx] = 1
|
||||
if show_animation:
|
||||
status = "REPEATED MATCH!"
|
||||
else:
|
||||
# false positive
|
||||
fp[idx] = 1
|
||||
if ovmax > 0:
|
||||
status = "INSUFFICIENT OVERLAP"
|
||||
|
||||
"""
|
||||
Draw image to show animation
|
||||
"""
|
||||
if show_animation:
|
||||
height, width = img.shape[:2]
|
||||
# colors (OpenCV works with BGR)
|
||||
white = (255,255,255)
|
||||
light_blue = (255,200,100)
|
||||
green = (0,255,0)
|
||||
light_red = (30,30,255)
|
||||
# 1st line
|
||||
margin = 10
|
||||
v_pos = int(height - margin - (bottom_border / 2.0))
|
||||
text = "Image: " + ground_truth_img[0] + " "
|
||||
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
|
||||
text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
|
||||
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
|
||||
if ovmax != -1:
|
||||
color = light_red
|
||||
if status == "INSUFFICIENT OVERLAP":
|
||||
text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
|
||||
else:
|
||||
text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
|
||||
color = green
|
||||
img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
|
||||
# 2nd line
|
||||
v_pos += int(bottom_border / 2.0)
|
||||
rank_pos = str(idx+1) # rank position (idx starts at 0)
|
||||
text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
|
||||
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
|
||||
color = light_red
|
||||
if status == "MATCH!":
|
||||
color = green
|
||||
text = "Result: " + status + " "
|
||||
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
|
||||
|
||||
font = cv2.FONT_HERSHEY_SIMPLEX
|
||||
if ovmax > 0: # if there is an intersection between the bounding-boxes
|
||||
bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ]
|
||||
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
|
||||
cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
|
||||
cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
|
||||
bb = [int(i) for i in bb]
|
||||
cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
|
||||
cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
|
||||
cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
|
||||
# show image
|
||||
cv2.imshow("Animation", img)
|
||||
cv2.waitKey(20) # show for 20 ms
|
||||
# save image to output
|
||||
output_img_path = output_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
|
||||
cv2.imwrite(output_img_path, img)
|
||||
# save the image with all the objects drawn to it
|
||||
cv2.imwrite(img_cumulative_path, img_cumulative)
|
||||
|
||||
#print(tp)
|
||||
# compute precision/recall
|
||||
cumsum = 0
|
||||
for idx, val in enumerate(fp):
|
||||
fp[idx] += cumsum
|
||||
cumsum += val
|
||||
cumsum = 0
|
||||
for idx, val in enumerate(tp):
|
||||
tp[idx] += cumsum
|
||||
cumsum += val
|
||||
#print(tp)
|
||||
rec = tp[:]
|
||||
for idx, val in enumerate(tp):
|
||||
rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]
|
||||
#print(rec)
|
||||
prec = tp[:]
|
||||
for idx, val in enumerate(tp):
|
||||
prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx])
|
||||
#print(prec)
|
||||
|
||||
ap, mrec, mprec = voc_ap(rec[:], prec[:])
|
||||
sum_AP += ap
|
||||
text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100)
|
||||
"""
|
||||
Write to output.txt
|
||||
"""
|
||||
rounded_prec = [ '%.2f' % elem for elem in prec ]
|
||||
rounded_rec = [ '%.2f' % elem for elem in rec ]
|
||||
output_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
|
||||
if not args.quiet:
|
||||
print(text)
|
||||
ap_dictionary[class_name] = ap
|
||||
|
||||
n_images = counter_images_per_class[class_name]
|
||||
lamr, mr, fppi = log_average_miss_rate(np.array(prec), np.array(rec), n_images)
|
||||
lamr_dictionary[class_name] = lamr
|
||||
|
||||
"""
|
||||
Draw plot
|
||||
"""
|
||||
if draw_plot:
|
||||
plt.plot(rec, prec, '-o')
|
||||
# add a new penultimate point to the list (mrec[-2], 0.0)
|
||||
# since the last line segment (and respective area) do not affect the AP value
|
||||
area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
|
||||
area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
|
||||
plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
|
||||
# set window title
|
||||
fig = plt.gcf() # gcf - get current figure
|
||||
fig.canvas.manager.set_window_title('AP ' + class_name)
|
||||
# set plot title
|
||||
plt.title('class: ' + text)
|
||||
#plt.suptitle('This is a somewhat long figure title', fontsize=16)
|
||||
# set axis titles
|
||||
plt.xlabel('Recall')
|
||||
plt.ylabel('Precision')
|
||||
# optional - set axes
|
||||
axes = plt.gca() # gca - get current axes
|
||||
axes.set_xlim([0.0,1.0])
|
||||
axes.set_ylim([0.0,1.05]) # .05 to give some extra space
|
||||
# Alternative option -> wait for button to be pressed
|
||||
#while not plt.waitforbuttonpress(): pass # wait for key display
|
||||
# Alternative option -> normal display
|
||||
#plt.show()
|
||||
# save the plot
|
||||
fig.savefig(output_files_path + "/classes/" + class_name + ".png")
|
||||
plt.cla() # clear axes for next plot
|
||||
|
||||
if show_animation:
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
output_file.write("\n# mAP of all classes\n")
|
||||
mAP = sum_AP / n_classes
|
||||
text = "mAP = {0:.2f}%".format(mAP*100)
|
||||
output_file.write(text + "\n")
|
||||
print(text)
|
||||
|
||||
"""
|
||||
Draw false negatives
|
||||
"""
|
||||
if show_animation:
|
||||
pink = (203,192,255)
|
||||
for tmp_file in gt_files:
|
||||
ground_truth_data = json.load(open(tmp_file))
|
||||
#print(ground_truth_data)
|
||||
# get name of corresponding image
|
||||
start = TEMP_FILES_PATH + '/'
|
||||
img_id = tmp_file[tmp_file.find(start)+len(start):tmp_file.rfind('_ground_truth.json')]
|
||||
img_cumulative_path = output_files_path + "/images/" + img_id + ".jpg"
|
||||
img = cv2.imread(img_cumulative_path)
|
||||
if img is None:
|
||||
img_path = IMG_PATH + '/' + img_id + ".jpg"
|
||||
img = cv2.imread(img_path)
|
||||
# draw false negatives
|
||||
for obj in ground_truth_data:
|
||||
if not obj['used']:
|
||||
bbgt = [ int(round(float(x))) for x in obj["bbox"].split() ]
|
||||
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),pink,2)
|
||||
cv2.imwrite(img_cumulative_path, img)
|
||||
|
||||
# remove the temp_files directory
|
||||
shutil.rmtree(TEMP_FILES_PATH)
|
||||
|
||||
"""
|
||||
Count total of detection-results
|
||||
"""
|
||||
# iterate through all the files
|
||||
det_counter_per_class = {}
|
||||
for txt_file in dr_files_list:
|
||||
# get lines to list
|
||||
lines_list = file_lines_to_list(txt_file)
|
||||
for line in lines_list:
|
||||
class_name = line.split()[0]
|
||||
# check if class is in the ignore list, if yes skip
|
||||
if class_name in args.ignore:
|
||||
continue
|
||||
# count that object
|
||||
if class_name in det_counter_per_class:
|
||||
det_counter_per_class[class_name] += 1
|
||||
else:
|
||||
# if class didn't exist yet
|
||||
det_counter_per_class[class_name] = 1
|
||||
#print(det_counter_per_class)
|
||||
dr_classes = list(det_counter_per_class.keys())
|
||||
|
||||
|
||||
"""
|
||||
Plot the total number of occurrences of each class in the ground-truth
|
||||
"""
|
||||
if draw_plot:
|
||||
window_title = "ground-truth-info"
|
||||
plot_title = "ground-truth\n"
|
||||
plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
|
||||
x_label = "Number of objects per class"
|
||||
output_path = output_files_path + "/ground-truth-info.png"
|
||||
to_show = False
|
||||
plot_color = 'forestgreen'
|
||||
draw_plot_func(
|
||||
gt_counter_per_class,
|
||||
n_classes,
|
||||
window_title,
|
||||
plot_title,
|
||||
x_label,
|
||||
output_path,
|
||||
to_show,
|
||||
plot_color,
|
||||
'',
|
||||
)
|
||||
|
||||
"""
|
||||
Write the number of ground-truth objects per class to output.txt
|
||||
"""
|
||||
with open(output_files_path + "/output.txt", 'a') as output_file:
|
||||
output_file.write("\n# Number of ground-truth objects per class\n")
|
||||
for class_name in sorted(gt_counter_per_class):
|
||||
output_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
|
||||
|
||||
"""
|
||||
Finish counting true positives
|
||||
"""
|
||||
for class_name in dr_classes:
|
||||
# if class exists in detection-result but not in ground-truth then there are no true positives in that class
|
||||
if class_name not in gt_classes:
|
||||
count_true_positives[class_name] = 0
|
||||
#print(count_true_positives)
|
||||
|
||||
"""
|
||||
Plot the total number of occurences of each class in the "detection-results" folder
|
||||
"""
|
||||
if draw_plot:
|
||||
window_title = "detection-results-info"
|
||||
# Plot title
|
||||
plot_title = "detection-results\n"
|
||||
plot_title += "(" + str(len(dr_files_list)) + " files and "
|
||||
count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
|
||||
plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
|
||||
# end Plot title
|
||||
x_label = "Number of objects per class"
|
||||
output_path = output_files_path + "/detection-results-info.png"
|
||||
to_show = False
|
||||
plot_color = 'forestgreen'
|
||||
true_p_bar = count_true_positives
|
||||
draw_plot_func(
|
||||
det_counter_per_class,
|
||||
len(det_counter_per_class),
|
||||
window_title,
|
||||
plot_title,
|
||||
x_label,
|
||||
output_path,
|
||||
to_show,
|
||||
plot_color,
|
||||
true_p_bar
|
||||
)
|
||||
|
||||
"""
|
||||
Write number of detected objects per class to output.txt
|
||||
"""
|
||||
with open(output_files_path + "/output.txt", 'a') as output_file:
|
||||
output_file.write("\n# Number of detected objects per class\n")
|
||||
for class_name in sorted(dr_classes):
|
||||
n_det = det_counter_per_class[class_name]
|
||||
text = class_name + ": " + str(n_det)
|
||||
text += " (tp:" + str(count_true_positives[class_name]) + ""
|
||||
text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
|
||||
output_file.write(text)
|
||||
|
||||
"""
|
||||
Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
|
||||
"""
|
||||
if draw_plot:
|
||||
window_title = "lamr"
|
||||
plot_title = "log-average miss rate"
|
||||
x_label = "log-average miss rate"
|
||||
output_path = output_files_path + "/lamr.png"
|
||||
to_show = False
|
||||
plot_color = 'royalblue'
|
||||
draw_plot_func(
|
||||
lamr_dictionary,
|
||||
n_classes,
|
||||
window_title,
|
||||
plot_title,
|
||||
x_label,
|
||||
output_path,
|
||||
to_show,
|
||||
plot_color,
|
||||
""
|
||||
)
|
||||
|
||||
"""
|
||||
Draw mAP plot (Show AP's of all classes in decreasing order)
|
||||
"""
|
||||
if draw_plot:
|
||||
window_title = "mAP"
|
||||
plot_title = "mAP = {0:.2f}%".format(mAP*100)
|
||||
x_label = "Average Precision"
|
||||
output_path = output_files_path + "/mAP.png"
|
||||
to_show = True
|
||||
plot_color = 'royalblue'
|
||||
draw_plot_func(
|
||||
ap_dictionary,
|
||||
n_classes,
|
||||
window_title,
|
||||
plot_title,
|
||||
x_label,
|
||||
output_path,
|
||||
to_show,
|
||||
plot_color,
|
||||
""
|
||||
)
|
|
@@ -0,0 +1,96 @@
|
|||
import os
|
||||
import json
|
||||
import torch
|
||||
from torchvision import datasets
|
||||
from torchvision import transforms
|
||||
from torch.utils.data import DataLoader
|
||||
from functools import partial
|
||||
|
||||
|
||||
class_to_idx = {
|
||||
'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4,
|
||||
'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 'diningtable':10,
|
||||
'dog':11, 'horse':12, 'motorbike':13, 'person':14, 'pottedplant':15,
|
||||
'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19
|
||||
}
|
||||
idx_to_class = {i:c for c, i in class_to_idx.items()}
|
||||
|
||||
|
||||
def get_pascal_voc2007_data(image_root, split='train'):
|
||||
"""
|
||||
Use torchvision.datasets
|
||||
https://pytorch.org/docs/stable/torchvision/datasets.html#torchvision.datasets.VOCDetection
|
||||
"""
|
||||
|
||||
train_dataset = datasets.VOCDetection(image_root, year='2007', image_set=split,
|
||||
download=False)
|
||||
|
||||
return train_dataset
|
||||
|
||||
|
||||
def pascal_voc2007_loader(dataset, batch_size, num_workers=0, shuffle=False, proposal_path=None):
|
||||
"""
|
||||
Data loader for Pascal VOC 2007.
|
||||
https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
|
||||
"""
|
||||
collate_fn = partial(voc_collate_fn, proposal_path=proposal_path)
|
||||
train_loader = DataLoader(dataset,
|
||||
batch_size=batch_size,
|
||||
shuffle=shuffle, pin_memory=True,
|
||||
num_workers=num_workers,
|
||||
collate_fn=collate_fn)
|
||||
return train_loader
|
||||
|
||||
|
||||
def voc_collate_fn(batch_lst, reshape_size=224, proposal_path=None):
|
||||
preprocess = transforms.Compose([
|
||||
transforms.Resize((reshape_size, reshape_size)),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
|
||||
])
|
||||
|
||||
batch_size = len(batch_lst)
|
||||
|
||||
img_batch = torch.zeros(batch_size, 3, reshape_size, reshape_size)
|
||||
|
||||
box_list = []
|
||||
box_batch_idx = []
|
||||
w_list = []
|
||||
h_list = []
|
||||
img_id_list = []
|
||||
proposal_list = []
|
||||
proposal_batch_idx = []
|
||||
|
||||
for i in range(batch_size):
|
||||
img, ann = batch_lst[i]
|
||||
w_list.append(img.size[0]) # image width
|
||||
h_list.append(img.size[1]) # image height
|
||||
img_id_list.append(ann['annotation']['filename'])
|
||||
img_batch[i] = preprocess(img)
|
||||
all_bbox = ann['annotation']['object']
|
||||
if isinstance(all_bbox, dict): # inconsistency in the annotation file
|
||||
all_bbox = [all_bbox]
|
||||
for bbox_idx, one_bbox in enumerate(all_bbox):
|
||||
bbox = one_bbox['bndbox']
|
||||
obj_cls = one_bbox['name']
|
||||
box_list.append(torch.Tensor([float(bbox['xmin']), float(bbox['ymin']),
|
||||
float(bbox['xmax']), float(bbox['ymax']), class_to_idx[obj_cls]]))
|
||||
box_batch_idx.append(i)
|
||||
if proposal_path is not None:
|
||||
proposal_fn = ann['annotation']['filename'].replace('.jpg', '.json')
|
||||
with open(os.path.join(proposal_path, proposal_fn), 'r') as f:
|
||||
proposal = json.load(f)
|
||||
for p in proposal:
|
||||
proposal_list.append([p['x_min'], p['y_min'], p['x_max'], p['y_max']])
|
||||
proposal_batch_idx.append(i)
|
||||
|
||||
h_batch = torch.tensor(h_list)
|
||||
w_batch = torch.tensor(w_list)
|
||||
box_batch = torch.stack(box_list)
|
||||
box_batch_ids = torch.tensor(box_batch_idx, dtype=torch.long)
|
||||
proposals = torch.tensor(proposal_list, dtype=box_batch.dtype)
|
||||
proposal_batch_ids = torch.tensor(proposal_batch_idx, dtype=torch.long)
|
||||
assert len(box_batch) == len(box_batch_ids)
|
||||
assert len(proposals) == len(proposal_batch_ids)
|
||||
|
||||
return img_batch, box_batch, box_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_id_list
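# Minimal usage sketch (assumes VOC2007 and the precomputed proposals are already
# on disk at the same paths the training script uses):
if __name__ == '__main__':
    dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'train')
    loader = pascal_voc2007_loader(
        dataset, batch_size=2,
        proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
    img_batch, box_batch, box_ids, proposals, proposal_ids, w, h, img_ids = next(iter(loader))
    print(img_batch.shape, box_batch.shape, proposals.shape)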
|
|
@@ -0,0 +1,33 @@
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def ClsScoreRegression(cls_scores, GT_label, batch_size):
|
||||
"""
|
||||
Multi-class cross-entropy loss
|
||||
|
||||
Inputs:
|
||||
- cls_scores: Predicted class scores, of shape (M, C).
|
||||
- GT_label: GT class labels, of shape (M,).
|
||||
|
||||
Outputs:
|
||||
- cls_score_loss: Torch scalar
|
||||
"""
|
||||
cls_loss = F.cross_entropy(cls_scores, GT_label, \
|
||||
reduction='sum') * 1. / batch_size
|
||||
return cls_loss
|
||||
|
||||
|
||||
def BboxRegression(offsets, GT_offsets, batch_size):
|
||||
""""
|
||||
Use SmoothL1 loss as in Faster R-CNN
|
||||
|
||||
Inputs:
|
||||
- offsets: Predicted box offsets, of shape (M, 4)
|
||||
- GT_offsets: GT box offsets, of shape (M, 4)
|
||||
|
||||
Outputs:
|
||||
- bbox_reg_loss: Torch scalar
|
||||
"""
|
||||
bbox_reg_loss = F.smooth_l1_loss(offsets, GT_offsets, reduction='sum') * 1. / batch_size
|
||||
return bbox_reg_loss
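# Quick sanity check for the two losses above (a hedged example; the shapes M=8,
# C=21 and batch_size=2 are arbitrary):
if __name__ == '__main__':
    M, C, batch_size = 8, 21, 2
    scores = torch.randn(M, C)            # predicted class scores
    labels = torch.randint(0, C, (M,))    # GT class labels
    offsets = torch.randn(M, 4)           # predicted box offsets
    gt_offsets = torch.randn(M, 4)        # GT box offsets
    print(ClsScoreRegression(scores, labels, batch_size).item())
    print(BboxRegression(offsets, gt_offsets, batch_size).item())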
|
|
@@ -0,0 +1,204 @@
|
|||
import math
|
||||
import copy
|
||||
import time
|
||||
import shutil
|
||||
import os
|
||||
import random
|
||||
os.environ['TORCH_HOME'] = './ckpts'
|
||||
|
||||
import argparse
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch import optim
|
||||
import torchvision
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
from dataset import get_pascal_voc2007_data, pascal_voc2007_loader, idx_to_class
|
||||
from model import FastRCNN
|
||||
from utils import coord_trans, data_visualizer
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser('Faster R-CNN', add_help=False)
|
||||
parser.add_argument('--lr', default=1e-3, type=float)
|
||||
parser.add_argument('--lr_decay', default=1.0, type=float)
|
||||
parser.add_argument('--batch_size', default=16, type=int)
|
||||
parser.add_argument('--epochs', default=200, type=int)
|
||||
parser.add_argument('--num_workers', default=4, type=int)
|
||||
parser.add_argument('--overfit_small_data', default=False, action='store_true')
|
||||
parser.add_argument('--output_dir', default='./exp/fast_rcnn')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
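# Example invocations (assuming this file is saved as train.py):
#   python train.py --batch_size 16 --epochs 200
#   python train.py --overfit_small_data        # quick sanity run on 10 images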
|
||||
|
||||
|
||||
def main(args):
|
||||
torch.manual_seed(0)
|
||||
torch.cuda.manual_seed(0)
|
||||
random.seed(0)
|
||||
if args.overfit_small_data:
|
||||
args.output_dir = args.output_dir + "_overfit_small"
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# build dataset & dataloader
|
||||
train_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'train')
|
||||
val_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'val')
|
||||
|
||||
train_loader = pascal_voc2007_loader(train_dataset, args.batch_size, shuffle=True, num_workers=args.num_workers,
|
||||
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
|
||||
val_loader = pascal_voc2007_loader(val_dataset, args.batch_size, num_workers=args.num_workers,
|
||||
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
|
||||
|
||||
if args.overfit_small_data:
|
||||
num_sample = 10
|
||||
small_dataset = torch.utils.data.Subset(
|
||||
train_dataset,
|
||||
torch.linspace(0, len(train_dataset)-1, steps=num_sample).long()
|
||||
)
|
||||
small_train_loader = pascal_voc2007_loader(small_dataset, 10,
|
||||
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
|
||||
val_dataset = small_dataset
|
||||
train_loader = small_train_loader
|
||||
val_loader = small_train_loader
|
||||
|
||||
model = FastRCNN()
|
||||
model.cuda()
|
||||
|
||||
# build optimizer
|
||||
optimizer = optim.SGD(
|
||||
filter(lambda p: p.requires_grad, model.parameters()),
|
||||
args.lr
|
||||
)
|
||||
lr_scheduler = optim.lr_scheduler.LambdaLR(
|
||||
optimizer,
|
||||
lambda epoch: args.lr_decay ** epoch
|
||||
)
|
||||
|
||||
# load ckpt
|
||||
ckpt_path = os.path.join(args.output_dir, 'checkpoint.pth')
|
||||
start_epoch = 0
|
||||
if os.path.exists(ckpt_path):
|
||||
checkpoint = torch.load(ckpt_path)
|
||||
start_epoch = checkpoint['epoch']
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
lr_scheduler.load_state_dict(checkpoint['lr_sched'])
|
||||
|
||||
if start_epoch < args.epochs:
|
||||
train(args, model, train_loader, optimizer, lr_scheduler, start_epoch)
|
||||
inference(args, model, val_loader, val_dataset, visualize=args.overfit_small_data)
|
||||
|
||||
|
||||
def train(args, model, train_loader, optimizer, lr_scheduler, start_epoch):
|
||||
loss_history = []
|
||||
model.train()
|
||||
for i in range(start_epoch, args.epochs):
|
||||
start_t = time.time()
|
||||
for iter_num, data_batch in enumerate(train_loader):
|
||||
images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, _ = data_batch
|
||||
resized_boxes = coord_trans(boxes, boxes_batch_ids, w_batch, h_batch, mode='p2a')
|
||||
resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a')
|
||||
|
||||
images = images.to(dtype=torch.float, device='cuda')
|
||||
resized_boxes = resized_boxes.to(dtype=torch.float, device='cuda')
|
||||
boxes_batch_ids = boxes_batch_ids.cuda()
|
||||
resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda')
|
||||
proposal_batch_ids = proposal_batch_ids.cuda()
|
||||
|
||||
loss = model(images, resized_boxes, boxes_batch_ids, resized_proposals, proposal_batch_ids)
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
loss_history.append(loss.item())
|
||||
optimizer.step()
|
||||
|
||||
if iter_num % 50 == 0:
|
||||
print('(Iter {} / {}) loss: {:.4f}'.format(iter_num, len(train_loader), np.mean(loss_history[-50:])))
|
||||
|
||||
end_t = time.time()
|
||||
print('(Epoch {} / {}) loss: {:.4f}, time per epoch: {:.1f}s'.format(
|
||||
i, args.epochs, np.mean(loss_history[-len(train_loader):]), end_t-start_t))
|
||||
lr_scheduler.step()
|
||||
|
||||
checkpoint = {
|
||||
'epoch': i + 1,
|
||||
'model': model.state_dict(),
|
||||
'optimizer': optimizer.state_dict(),
|
||||
'lr_sched': lr_scheduler.state_dict()}
|
||||
torch.save(checkpoint, os.path.join(args.output_dir, 'checkpoint.pth'))
|
||||
|
||||
# plot the training losses
|
||||
fig, ax = plt.subplots()
|
||||
ax.plot(loss_history)
|
||||
ax.set_xlabel('Iteration')
|
||||
ax.set_ylabel('Loss')
|
||||
ax.set_title('Training loss history')
|
||||
fig.savefig(os.path.join(args.output_dir, 'training_loss.png'))
|
||||
plt.close()
|
||||
|
||||
|
||||
def inference(args, model, val_loader, dataset, thresh=0.5, nms_thresh=0.5, visualize=False):
|
||||
model.eval()
|
||||
start_t = time.time()
|
||||
|
||||
if args.output_dir is not None:
|
||||
det_dir = os.path.join(args.output_dir, 'mAP_input/detection-results')
|
||||
gt_dir = os.path.join(args.output_dir, 'mAP_input/ground-truth')
|
||||
vis_dir = os.path.join(args.output_dir, 'visualize')
|
||||
os.makedirs(det_dir, exist_ok=True)
|
||||
os.makedirs(gt_dir, exist_ok=True)
|
||||
os.makedirs(vis_dir, exist_ok=True)
|
||||
|
||||
for iter_num, data_batch in enumerate(val_loader):
|
||||
images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_ids = data_batch
|
||||
images = images.to(dtype=torch.float, device='cuda')
|
||||
resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a')
|
||||
resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda')
|
||||
proposal_batch_ids = proposal_batch_ids.cuda()
|
||||
|
||||
with torch.no_grad():
|
||||
final_proposals, final_conf_scores, final_class = \
|
||||
model.inference(images, resized_proposals, proposal_batch_ids, thresh=thresh, nms_thresh=nms_thresh)
|
||||
|
||||
# clamp on the proposal coordinates
|
||||
batch_size = len(images)
|
||||
for idx in range(batch_size):
|
||||
torch.clamp_(final_proposals[idx][:, 0::2], min=0, max=w_batch[idx])
|
||||
torch.clamp_(final_proposals[idx][:, 1::2], min=0, max=h_batch[idx])
|
||||
|
||||
# visualization
|
||||
# get the original image
|
||||
# hack to get the original image so we don't have to load from local again...
|
||||
i = batch_size*iter_num + idx
|
||||
img, _ = dataset.__getitem__(i)
|
||||
|
||||
box_per_img = boxes[boxes_batch_ids==idx]
|
||||
final_all = torch.cat((final_proposals[idx], \
|
||||
final_class[idx].float(), final_conf_scores[idx]), dim=-1).cpu()
|
||||
final_batch_idx = torch.LongTensor([idx] * final_all.shape[0])
|
||||
resized_final_proposals = coord_trans(final_all, final_batch_idx, w_batch, h_batch)
|
||||
|
||||
# write results to file for evaluation (use mAP API https://github.com/Cartucho/mAP for now...)
|
||||
if args.output_dir is not None:
|
||||
file_name = img_ids[idx].replace('.jpg', '.txt')
|
||||
with open(os.path.join(det_dir, file_name), 'w') as f_det, \
|
||||
open(os.path.join(gt_dir, file_name), 'w') as f_gt:
|
||||
print('{}: {} GT bboxes and {} proposals'.format(img_ids[idx], len(box_per_img), resized_final_proposals.shape[0]))
|
||||
for b in box_per_img:
|
||||
f_gt.write('{} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[0], b[1], b[2], b[3]))
|
||||
for b in resized_final_proposals:
|
||||
f_det.write('{} {:.6f} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[5], b[0], b[1], b[2], b[3]))
|
||||
|
||||
if visualize:
|
||||
data_visualizer(img, idx_to_class, os.path.join(vis_dir, img_ids[idx]), box_per_img, resized_final_proposals)
|
||||
|
||||
end_t = time.time()
|
||||
print('Total inference time: {:.1f}s'.format(end_t-start_t))
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
args = parse_args()
|
||||
main(args)
|
|
@@ -0,0 +1,212 @@
|
|||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision
|
||||
from torchvision import models
|
||||
|
||||
from utils import compute_offsets, assign_label, generate_proposal
|
||||
from loss import ClsScoreRegression, BboxRegression
|
||||
|
||||
|
||||
class FeatureExtractor(nn.Module):
|
||||
"""
|
||||
Image feature extraction with MobileNet.
|
||||
"""
|
||||
def __init__(self, reshape_size=224, pooling=False, verbose=False):
|
||||
super().__init__()
|
||||
|
||||
self.mobilenet = models.mobilenet_v2(pretrained=True)
|
||||
self.mobilenet = nn.Sequential(*list(self.mobilenet.children())[:-1]) # Remove the last classifier
|
||||
|
||||
# average pooling
|
||||
if pooling:
|
||||
self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size/32.))) # input: N x 1280 x 7 x 7
|
||||
|
||||
for i in self.mobilenet.named_parameters():
|
||||
i[1].requires_grad = True # fine-tune all
|
||||
|
||||
def forward(self, img, verbose=False):
|
||||
"""
|
||||
Inputs:
|
||||
- img: Batch of resized images, of shape Nx3x224x224
|
||||
|
||||
Outputs:
|
||||
- feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
|
||||
"""
|
||||
num_img = img.shape[0]
|
||||
|
||||
img_prepro = img
|
||||
|
||||
feat = []
|
||||
process_batch = 500
|
||||
for b in range(math.ceil(num_img/process_batch)):
|
||||
feat.append(self.mobilenet(img_prepro[b*process_batch:(b+1)*process_batch]
|
||||
).squeeze(-1).squeeze(-1)) # forward and squeeze
|
||||
feat = torch.cat(feat)
|
||||
|
||||
if verbose:
|
||||
print('Output feature shape: ', feat.shape)
|
||||
|
||||
return feat
|
||||
|
||||
|
||||
class FastRCNN(nn.Module):
|
||||
def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \
|
||||
roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
|
||||
super().__init__()
|
||||
|
||||
assert(num_classes != 0)
|
||||
self.num_classes = num_classes
|
||||
self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
|
||||
self.feat_extractor = FeatureExtractor()
|
||||
##############################################################################
|
||||
# TODO: Declare the cls & bbox heads (in Fast R-CNN). #
|
||||
# The cls & bbox heads share a sequential module with a Linear layer, #
|
||||
# followed by a Dropout (p=drop_ratio), a ReLU nonlinearity and another #
|
||||
# Linear layer. #
|
||||
# The cls head is a Linear layer that predicts num_classes + 1 (background). #
|
||||
# The det head is a Linear layer that predicts offsets(dim=4). #
|
||||
# HINT: The dimension of the two Linear layers are in_dim -> hidden_dim and #
|
||||
# hidden_dim -> hidden_dim. #
|
||||
##############################################################################
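# One possible layout for the heads (a sketch only, not the reference solution;
# the attribute names self.shared_fc / self.cls_head / self.bbox_head are assumptions):
#   self.shared_fc = nn.Sequential(
#       nn.Linear(in_dim, hidden_dim),
#       nn.Dropout(drop_ratio),
#       nn.ReLU(),
#       nn.Linear(hidden_dim, hidden_dim),
#   )
#   self.cls_head = nn.Linear(hidden_dim, num_classes + 1)   # +1 for background
#   self.bbox_head = nn.Linear(hidden_dim, 4)                 # box offsets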
|
||||
# Replace "pass" statement with your code
|
||||
pass
|
||||
##############################################################################
|
||||
# END OF YOUR CODE #
|
||||
##############################################################################
|
||||
|
||||
def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):
|
||||
"""
|
||||
Training-time forward pass for our Fast R-CNN detector (the second stage of Faster R-CNN).
|
||||
|
||||
Inputs:
|
||||
- images: Tensor of shape (B, 3, H, W) giving input images
|
||||
- bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes
|
||||
and category labels, from the dataloader, where N is the total number
|
||||
of GT boxes in the batch
|
||||
- bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)
|
||||
of the image that each GT box belongs to
|
||||
- proposals: Tensor of shape (M, 4) giving the proposals for input images,
|
||||
where M is the total number of proposals in the batch
|
||||
- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
|
||||
that each proposals belongs to
|
||||
|
||||
Outputs:
|
||||
- total_loss: Torch scalar giving the overall training loss.
|
||||
"""
|
||||
w_cls = 1 # for cls_scores
|
||||
w_bbox = 1 # for offsets
|
||||
total_loss = None
|
||||
##############################################################################
|
||||
# TODO: Implement the forward pass of Fast R-CNN. #
|
||||
# A few key steps are outlined as follows: #
|
||||
# i) Extract image feature.                                                  #
|
||||
# ii) Perform RoI Align on proposals, then meanpool the feature in the #
|
||||
# spatial dimension. #
|
||||
# iii) Pass the RoI feature through the shared-fc layer. Predict #
|
||||
# classification scores and box offsets.                                     #
|
||||
# iv) Assign the proposals with targets of each image. #
|
||||
# v) Compute the cls_loss between the predicted class_prob and GT_class #
|
||||
# (For positive & negative proposals)                                        #
|
||||
# Compute the bbox_loss between the offsets and GT_offsets #
|
||||
# (For positive proposals) #
|
||||
# Compute the total_loss which is formulated as: #
|
||||
# total_loss = w_cls*cls_loss + w_bbox*bbox_loss. #
|
||||
##############################################################################
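# Rough shape of the computation (a sketch only, not the reference solution; the
# attribute names and the use of torchvision.ops.roi_align are assumptions):
#   feat = self.feat_extractor(images)                                   # (B, 1280, 7, 7)
#   rois = torch.cat([proposal_batch_ids[:, None].float(), proposals], dim=1)
#   roi_feat = torchvision.ops.roi_align(
#       feat, rois, (self.roi_output_h, self.roi_output_w)).mean(dim=(2, 3))  # (M, 1280)
#   then pass roi_feat through the shared fc layer and the two heads, build the
#   targets per image with assign_label/compute_offsets, and combine
#   ClsScoreRegression and BboxRegression into total_loss.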
|
||||
# Replace "pass" statement with your code
|
||||
B, _, H, W = images.shape
|
||||
|
||||
# extract image feature
|
||||
pass
|
||||
|
||||
# perform RoI Pool & mean pool
|
||||
pass
|
||||
|
||||
# forward heads, get predicted cls scores & offsets
|
||||
pass
|
||||
|
||||
# assign targets with proposals
|
||||
pos_masks, neg_masks, GT_labels, GT_bboxes = [], [], [], []
|
||||
for img_idx in range(B):
|
||||
# get the positive/negative proposals and corresponding
|
||||
# GT box & class label of this image
|
||||
pass
|
||||
|
||||
# compute loss
|
||||
pass
|
||||
|
||||
##############################################################################
|
||||
# END OF YOUR CODE #
|
||||
##############################################################################
|
||||
return total_loss
|
||||
|
||||
def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):
|
||||
""""
|
||||
Inference-time forward pass for our Fast R-CNN detector (the second stage of Faster R-CNN)
|
||||
|
||||
Inputs:
|
||||
- images: Tensor of shape (B, 3, H, W) giving input images
|
||||
- proposals: Tensor of shape (M, 4) giving the proposals for input images,
|
||||
where M is the total number of proposals in the batch
|
||||
- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
|
||||
that each proposals belongs to
|
||||
- thresh: Threshold value on confidence probability. HINT: You can convert the
|
||||
classification score to probability using a softmax nonlinearity.
|
||||
- nms_thresh: IoU threshold for NMS
|
||||
|
||||
We can output a variable number of predicted boxes per input image.
|
||||
In particular we assume that the input images[i] gives rise to P_i final
|
||||
predicted boxes.
|
||||
|
||||
Outputs:
|
||||
- final_proposals: List of length (B,) where final_proposals[i] is a Tensor
|
||||
of shape (P_i, 4) giving the coordinates of the final predicted boxes for
|
||||
the input images[i]
|
||||
- final_conf_probs: List of length (B,) where final_conf_probs[i] is a
|
||||
Tensor of shape (P_i, 1) giving the predicted probabilites that the boxes
|
||||
in final_proposals[i] are objects (vs background)
|
||||
- final_class: List of length (B,), where final_class[i] is an int64 Tensor
|
||||
of shape (P_i, 1) giving the predicted category labels for each box in
|
||||
final_proposals[i].
|
||||
"""
|
||||
final_proposals, final_conf_probs, final_class = None, None, None
|
||||
##############################################################################
|
||||
# TODO: Predicting the final proposal coordinates `final_proposals`, #
|
||||
# confidence scores `final_conf_probs`, and the class index `final_class`. #
|
||||
# The overall steps are similar to the forward pass, but now you cannot #
|
||||
# decide the activated nor negative proposals without GT boxes. #
|
||||
# You should apply post-processing (thresholding and NMS) to all proposals #
|
||||
# and keep the final proposals. #
|
||||
##############################################################################
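# Post-processing sketch (not the reference solution; torch.softmax and
# torchvision.ops.nms are the assumed building blocks, and treating the last
# score column as background is an assumption):
#   probs = torch.softmax(cls_scores, dim=-1)        # (M, num_classes + 1)
#   conf, cls = probs[:, :-1].max(dim=-1)            # best foreground class per proposal
#   for each image: keep = conf > thresh, then
#   kept = torchvision.ops.nms(boxes[keep], conf[keep], nms_thresh)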
|
||||
# Replace "pass" statement with your code
|
||||
B = images.shape[0]
|
||||
|
||||
# extract image feature
|
||||
pass
|
||||
|
||||
# perform RoI Pool & mean pool
|
||||
pass
|
||||
|
||||
# forward heads, get predicted cls scores & offsets
|
||||
pass
|
||||
|
||||
# get predicted boxes & class label & confidence probability
|
||||
pass

        final_proposals = []
        final_conf_probs = []
        final_class = []
        # post-process to get final predictions
        for img_idx in range(B):

            # filter by threshold
            pass

            # nms
            pass
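            # A hedged sketch of the per-image post-processing. `boxes_i`, `probs_i`,
            # and `classes_i` are hypothetical names for this image's thresholded
            # predictions; torchvision.ops.nms returns the indices to keep:
            # keep = torchvision.ops.nms(boxes_i, probs_i, nms_thresh)
            # final_proposals.append(boxes_i[keep])
            # final_conf_probs.append(probs_i[keep].unsqueeze(1))
            # final_class.append(classes_i[keep].unsqueeze(1))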

        ##############################################################################
        #                               END OF YOUR CODE                            #
        ##############################################################################
        return final_proposals, final_conf_probs, final_class

@ -0,0 +1,287 @@
import numpy as np
import cv2
from matplotlib import pyplot as plt
import torch


def data_visualizer(img, idx_to_class, path, bbox=None, pred=None):
    """
    Data visualizer on the original image. Supports both GT box input and proposal input.

    Input:
    - img: PIL Image input
    - idx_to_class: Mapping from the index (0-19) to the class name
    - bbox: GT bbox (in red, optional), a tensor of shape Nx5, where N is
      the number of GT boxes, 5 indicates (x_tl, y_tl, x_br, y_br, class)
    - pred: Predicted bbox (in green, optional), a tensor of shape N'x6, where
      N' is the number of predicted boxes, 6 indicates
      (x_tl, y_tl, x_br, y_br, class, object confidence score)
    """

    img_copy = np.array(img).astype('uint8')

    if bbox is not None:
        for bbox_idx in range(bbox.shape[0]):
            one_bbox = bbox[bbox_idx][:4].numpy().astype('int')
            cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2],
                          one_bbox[3]), (255, 0, 0), 2)
            if bbox.shape[1] > 4:  # if class info provided
                obj_cls = idx_to_class[bbox[bbox_idx][4].item()]
                cv2.putText(img_copy, '%s' % (obj_cls),
                            (one_bbox[0], one_bbox[1]+15),
                            cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)

    if pred is not None:
        for bbox_idx in range(pred.shape[0]):
            one_bbox = pred[bbox_idx][:4].numpy().astype('int')
            cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2],
                          one_bbox[3]), (0, 255, 0), 2)

            if pred.shape[1] > 4:  # if class and conf score info provided
                obj_cls = idx_to_class[pred[bbox_idx][4].item()]
                conf_score = pred[bbox_idx][5].item()
                cv2.putText(img_copy, '%s, %.2f' % (obj_cls, conf_score),
                            (one_bbox[0], one_bbox[1]+15),
                            cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)

    plt.imshow(img_copy)
    plt.axis('off')
    plt.title(path)
    plt.savefig(path)
    plt.close()


def coord_trans(bbox, bbox_batch_idx, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'):
    """
    Coordinate transformation function. It converts the box coordinates from
    the image coordinate system to the activation map coordinate system and vice versa.
    In our case, the input image will have a few hundred pixels in
    width/height while the activation map is of size 7x7.

    Input:
    - bbox: Could be either bbox, anchor, or proposal, of shape Mx4
    - bbox_batch_idx: Index of the image that each bbox belongs to, of shape M
    - w_pixel: Number of pixels in the width side of the original image, of shape B
    - h_pixel: Number of pixels in the height side of the original image, of shape B
    - w_amap: Number of pixels in the width side of the activation map, scalar
    - h_amap: Number of pixels in the height side of the activation map, scalar
    - mode: Whether to transform from the original image to the activation map
      ('p2a') or the opposite ('a2p')

    Output:
    - resized_bbox: Resized box coordinates, of the same shape as the input bbox
    """

    assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!'
    assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1'

    if bbox.shape[0] == 0:  # corner cases
        return bbox

    resized_bbox = bbox.clone()

    if mode == 'p2a':
        # pixel to activation
        width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap
        height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap
        resized_bbox[:, [0, 2]] /= width_ratio.view(-1, 1)
        resized_bbox[:, [1, 3]] /= height_ratio.view(-1, 1)
    else:
        # activation to pixel
        width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap
        height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap
        resized_bbox[:, [0, 2]] *= width_ratio.view(-1, 1)
        resized_bbox[:, [1, 3]] *= height_ratio.view(-1, 1)

    return resized_bbox


def generate_anchor(anc_per_grid, grid):
    """
    Anchor generator.

    Inputs:
    - anc_per_grid: Tensor of shape (A, 2) giving the shapes of anchor boxes to
      consider at each point in the grid. anc_per_grid[a] = (w, h) gives the width
      and height of the a'th anchor shape.
    - grid: Tensor of shape (B, H', W', 2) giving the (x, y) coordinates of the
      center of each feature from the backbone feature map. This is the tensor
      returned from GenerateGrid.

    Outputs:
    - anchors: Tensor of shape (B, A, H', W', 4) giving the positions of all
      anchor boxes for the entire image. anchors[b, a, h, w] is an anchor box
      centered at grid[b, h, w], whose shape is given by anc_per_grid[a]; we
      parameterize boxes as anchors[b, a, h, w] = (x_tl, y_tl, x_br, y_br), where
      (x_tl, y_tl) and (x_br, y_br) give the xy coordinates of the top-left and
      bottom-right corners of the box.
    """
    A, _ = anc_per_grid.shape
    B, H, W, _ = grid.shape
    anc_per_grid = anc_per_grid.to(grid)

    anc_per_grid = anc_per_grid.view(1, A, 1, 1, -1).repeat(B, 1, H, W, 1)
    grid = grid.view(B, 1, H, W, -1).repeat(1, A, 1, 1, 1)

    x1y1 = grid - anc_per_grid / 2
    x2y2 = grid + anc_per_grid / 2
    anchors = torch.cat([x1y1, x2y2], dim=-1)

    return anchors


def compute_iou(anchors, bboxes):
    """
    Compute the intersection-over-union between anchors and gts.

    Inputs:
    - anchors: Anchor boxes, of shape (M, 4), where M is the number of proposals
    - bboxes: GT boxes of shape (N, 4), where N is the number of GT boxes,
      4 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt})

    Outputs:
    - iou: IoU matrix of shape (M, N)
    """
    iou = None
    ##############################################################################
    # TODO: Given anchors and gt bboxes,                                         #
    # compute the iou between each anchor and gt bbox.                           #
    ##############################################################################
    pass
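    # One possible vectorized sketch (a hedged example, not necessarily the
    # intended reference solution): broadcast anchors against GT boxes, clamp the
    # intersection to be non-negative, then divide by the union.
    x_tl = torch.max(anchors[:, None, 0], bboxes[None, :, 0])
    y_tl = torch.max(anchors[:, None, 1], bboxes[None, :, 1])
    x_br = torch.min(anchors[:, None, 2], bboxes[None, :, 2])
    y_br = torch.min(anchors[:, None, 3], bboxes[None, :, 3])
    inter = (x_br - x_tl).clamp(min=0) * (y_br - y_tl).clamp(min=0)  # (M, N)
    area_anc = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
    area_box = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    iou = inter / (area_anc[:, None] + area_box[None, :] - inter)  # (M, N)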

    ##############################################################################
    #                               END OF YOUR CODE                            #
    ##############################################################################

    return iou


def compute_offsets(anchors, bboxes):
    """
    Compute the offsets between anchors and gts.

    Inputs:
    - anchors: Anchor boxes, of shape (M, 4)
    - bboxes: GT boxes of shape (M, 4),
      4 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt})

    Outputs:
    - offsets: offsets of shape (M, 4)
    """
    wh_offsets = torch.log((bboxes[:, 2:4] - bboxes[:, :2]) \
                           / (anchors[:, 2:4] - anchors[:, :2]))

    xy_offsets = (bboxes[:, :2] + bboxes[:, 2:4] - \
                  anchors[:, :2] - anchors[:, 2:4]) / 2.

    xy_offsets /= (anchors[:, 2:4] - anchors[:, :2])

    offsets = torch.cat((xy_offsets, wh_offsets), dim=-1)

    return offsets


def generate_proposal(anchors, offsets):
    """
    Proposal generator.

    Inputs:
    - anchors: Anchor boxes, of shape (M, 4). Anchors are represented
      by the coordinates of their top-left and bottom-right corners.
    - offsets: Transformations of shape (M, 4) that will be used to
      convert anchor boxes into region proposals. The transformation
      offsets[m] = (tx, ty, tw, th) will be applied to the anchor
      anchors[m].

    Outputs:
    - proposals: Region proposals of shape (M, 4), represented by the
      coordinates of their top-left and bottom-right corners. Applying the
      transform offsets[m] to the anchor anchors[m] should give the
      proposal proposals[m].
    """
    proposals = None
    ##############################################################################
    # TODO: Given anchor coordinates and the proposed offset for each anchor,    #
    # compute the proposal coordinates using the transformation formulas above.  #
    ##############################################################################
    # Replace "pass" statement with your code
    pass
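    # A hedged sketch that simply inverts compute_offsets above (centers shifted
    # by tx, ty scaled by the anchor size; width/height scaled by exp(tw), exp(th)):
    anc_wh = anchors[:, 2:4] - anchors[:, :2]
    anc_ctr = (anchors[:, :2] + anchors[:, 2:4]) / 2.
    new_ctr = anc_ctr + offsets[:, :2] * anc_wh
    new_wh = anc_wh * torch.exp(offsets[:, 2:4])
    proposals = torch.cat([new_ctr - new_wh / 2., new_ctr + new_wh / 2.], dim=-1)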

    ##############################################################################
    #                               END OF YOUR CODE                            #
    ##############################################################################

    return proposals


@torch.no_grad()
def assign_label(proposals, bboxes, background_id, pos_thresh=0.5, neg_thresh=0.5, pos_fraction=0.25):
    """
    Determine the activated (positive) and negative proposals for model training.

    For Fast R-CNN, positive proposals are defined as either of the two:
    (i) the proposal/proposals with the highest IoU overlap with a GT box, or
    (ii) a proposal that has an IoU overlap higher than the positive threshold with any GT box.
    Note: One proposal can match at most one GT box (the one with the largest IoU overlap).

    We assign a negative label to a proposal if its IoU ratio is lower than
    a threshold value for all GT boxes. Proposals that are neither positive nor negative
    do not contribute to the training objective.

    Main steps include:
    i) Decide activated and negative proposals based on the IoU matrix.
    ii) Compute GT confidence score/offsets/object class on the positive proposals.
    iii) Compute GT confidence score on the negative proposals.

    Inputs:
    - proposals: Proposal boxes, of shape (M, 4), where M is the number of proposals
    - bboxes: GT boxes of shape Nx5, where N is the number of GT boxes,
      5 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt}) and class index
    - background_id: Class id of the background class
    - pos_thresh: Positive threshold value
    - neg_thresh: Negative threshold value
    - pos_fraction: a factor balancing pos/neg proposals

    Outputs:
    - activated_anc_mask: a binary mask indicating the activated proposals, of shape M
    - negative_anc_mask: a binary mask indicating the negative proposals, of shape M
    - GT_class: GT class category on all proposals, background class for non-activated proposals,
      of shape M
    - bboxes: GT bboxes on activated proposals, of shape M'x4, where M' is the number of
      activated proposals
    """
    M = proposals.shape[0]
    N = bboxes.shape[0]
    iou_mat = compute_iou(proposals, bboxes[:, :4])

    # activated/positive proposals
    max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)
    max_iou_per_box = iou_mat.max(dim=0, keepdim=True)[0]
    activated_anc_mask = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0)
    activated_anc_mask |= (iou_mat > pos_thresh)  # using the pos_thresh condition as well
    activated_anc_mask = activated_anc_mask.max(dim=-1)[0]  # (M, )
    activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1)

    # GT class
    box_cls = bboxes[:, 4].long().view(1, N).expand(M, N)
    # if a proposal matches multiple GT boxes, choose the box with the largest iou
    GT_class = torch.gather(box_cls, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1)  # M
    GT_class[~activated_anc_mask] = background_id

    # GT bboxes
    bboxes_expand = bboxes[:, :4].view(1, N, 4).expand((M, N, 4))
    bboxes = torch.gather(bboxes_expand, -2, max_iou_per_anc_ind.unsqueeze(-1) \
                          .unsqueeze(-1).expand(M, 1, 4)).view(M, 4)
    bboxes = bboxes[activated_anc_ind]

    # negative anchors
    negative_anc_mask = (max_iou_per_anc < neg_thresh)
    negative_anc_ind = torch.nonzero(negative_anc_mask.view(-1)).squeeze(-1)
    # balance pos/neg anchors, randomly sample the negatives
    num_neg = int(activated_anc_ind.shape[0] * (1 - pos_fraction) / pos_fraction)
    negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (num_neg,))]
    negative_anc_mask = torch.zeros_like(negative_anc_mask)
    negative_anc_mask[negative_anc_ind] = 1

    return activated_anc_mask, negative_anc_mask, GT_class, bboxes
Binary file not shown.