commit 16b08f040b90a3c7fce5ed0839d5f18dc234765e Author: ClF3 Date: Wed Nov 13 13:46:39 2024 +0800 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b9a52d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.venv +__pycache__ +*.pyc +*.pyo +ckpts +data +exp diff --git a/compute_mAP.py b/compute_mAP.py new file mode 100644 index 0000000..dd6ab60 --- /dev/null +++ b/compute_mAP.py @@ -0,0 +1,904 @@ +import glob +import json +import os +import shutil +import operator +import sys +import argparse +import math + +import numpy as np + +MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge) + +parser = argparse.ArgumentParser() +parser.add_argument('--path', type=str, help="the saving directory to compute mAP") +parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true") +parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true") +parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true") +# argparse receiving list of classes to be ignored (e.g., python main.py --ignore person book) +parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.") +# argparse receiving list of classes with specific IoU (e.g., python main.py --set-class-iou person 0.7) +parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.") +args = parser.parse_args() + +''' + 0,0 ------> x (width) + | + | (Left,Top) + | *_________ + | | | + | | + y |_________| + (height) * + (Right,Bottom) +''' + +# if there are no classes to ignore then replace None by empty list +if args.ignore is None: + args.ignore = [] + +specific_iou_flagged = False +if args.set_class_iou is not None: + specific_iou_flagged = True + +# make sure that the cwd() is the location of the python script (so that every path makes sense) +os.chdir(os.path.dirname(os.path.abspath(__file__))) + +GT_PATH = os.path.join(args.path, 'mAP_input', 'ground-truth') +DR_PATH = os.path.join(args.path, 'mAP_input', 'detection-results') +# if there are no images then no animation can be shown +IMG_PATH = os.path.join(args.path, 'mAP_input', 'images-optional') +if os.path.exists(IMG_PATH): + for dirpath, dirnames, files in os.walk(IMG_PATH): + if not files: + # no image files found + args.no_animation = True +else: + args.no_animation = True + +# try to import OpenCV if the user didn't choose the option --no-animation +show_animation = False +if not args.no_animation: + try: + import cv2 + show_animation = True + except ImportError: + print("\"opencv-python\" not found, please install to visualize the results.") + args.no_animation = True + +# try to import Matplotlib if the user didn't choose the option --no-plot +draw_plot = False +if not args.no_plot: + try: + import matplotlib.pyplot as plt + draw_plot = True + except ImportError: + print("\"matplotlib\" not found, please install it to get the resulting plots.") + args.no_plot = True + + +def log_average_miss_rate(prec, rec, num_images): + """ + log-average miss rate: + Calculated by averaging miss rates at 9 evenly spaced FPPI points + between 10e-2 and 10e0, in log-space. + + output: + lamr | log-average miss rate + mr | miss rate + fppi | false positives per image + + references: + [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the + State of the Art." Pattern Analysis and Machine Intelligence, IEEE + Transactions on 34.4 (2012): 743 - 761. 
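+        (Concretely, the 9 reference points used below are np.logspace(-2.0, 0.0, num=9),
+        i.e. FPPI values from 1e-2 to 1e0.)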
+ """ + + # if there were no detections of that class + if prec.size == 0: + lamr = 0 + mr = 1 + fppi = 0 + return lamr, mr, fppi + + fppi = (1 - prec) + mr = (1 - rec) + + fppi_tmp = np.insert(fppi, 0, -1.0) + mr_tmp = np.insert(mr, 0, 1.0) + + # Use 9 evenly spaced reference points in log-space + ref = np.logspace(-2.0, 0.0, num = 9) + for i, ref_i in enumerate(ref): + # np.where() will always find at least 1 index, since min(ref) = 0.01 and min(fppi_tmp) = -1.0 + j = np.where(fppi_tmp <= ref_i)[-1][-1] + ref[i] = mr_tmp[j] + + # log(0) is undefined, so we use the np.maximum(1e-10, ref) + lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref)))) + + return lamr, mr, fppi + +""" + throw error and exit +""" +def error(msg): + print(msg) + sys.exit(0) + +""" + check if the number is a float between 0.0 and 1.0 +""" +def is_float_between_0_and_1(value): + try: + val = float(value) + if val > 0.0 and val < 1.0: + return True + else: + return False + except ValueError: + return False + +""" + Calculate the AP given the recall and precision array + 1st) We compute a version of the measured precision/recall curve with + precision monotonically decreasing + 2nd) We compute the AP as the area under this curve by numerical integration. +""" +def voc_ap(rec, prec): + """ + --- Official matlab code VOC2012--- + mrec=[0 ; rec ; 1]; + mpre=[0 ; prec ; 0]; + for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + end + i=find(mrec(2:end)~=mrec(1:end-1))+1; + ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + rec.insert(0, 0.0) # insert 0.0 at begining of list + rec.append(1.0) # insert 1.0 at end of list + mrec = rec[:] + prec.insert(0, 0.0) # insert 0.0 at begining of list + prec.append(0.0) # insert 0.0 at end of list + mpre = prec[:] + """ + This part makes the precision monotonically decreasing + (goes from the end to the beginning) + matlab: for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + """ + # matlab indexes start in 1 but python in 0, so I have to do: + # range(start=(len(mpre) - 2), end=0, step=-1) + # also the python function range excludes the end, resulting in: + # range(start=(len(mpre) - 2), end=-1, step=-1) + for i in range(len(mpre)-2, -1, -1): + mpre[i] = max(mpre[i], mpre[i+1]) + """ + This part creates a list of indexes where the recall changes + matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1; + """ + i_list = [] + for i in range(1, len(mrec)): + if mrec[i] != mrec[i-1]: + i_list.append(i) # if it was matlab would be i + 1 + """ + The Average Precision (AP) is the area under the curve + (numerical integration) + matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + ap = 0.0 + for i in i_list: + ap += ((mrec[i]-mrec[i-1])*mpre[i]) + return ap, mrec, mpre + + +""" + Convert the lines of a file to a list +""" +def file_lines_to_list(path): + # open txt file lines to a list + with open(path) as f: + content = f.readlines() + # remove whitespace characters like `\n` at the end of each line + content = [x.strip() for x in content] + return content + +""" + Draws text in image +""" +def draw_text_in_image(img, text, pos, color, line_width): + font = cv2.FONT_HERSHEY_PLAIN + fontScale = 1 + lineType = 1 + bottomLeftCornerOfText = pos + cv2.putText(img, text, + bottomLeftCornerOfText, + font, + fontScale, + color, + lineType) + text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0] + return img, (line_width + text_width) + +""" + Plot - adjust axes +""" +def adjust_axes(r, t, fig, axes): + # get text width for re-scaling + bb = t.get_window_extent(renderer=r) + 
text_width_inches = bb.width / fig.dpi + # get axis width in inches + current_fig_width = fig.get_figwidth() + new_fig_width = current_fig_width + text_width_inches + propotion = new_fig_width / current_fig_width + # get axis limit + x_lim = axes.get_xlim() + axes.set_xlim([x_lim[0], x_lim[1]*propotion]) + +""" + Draw plot using Matplotlib +""" +def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar): + # sort the dictionary by decreasing value, into a list of tuples + sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1)) + # unpacking the list of tuples into two lists + sorted_keys, sorted_values = zip(*sorted_dic_by_value) + # + if true_p_bar != "": + """ + Special case to draw in: + - green -> TP: True Positives (object detected and matches ground-truth) + - red -> FP: False Positives (object detected but does not match ground-truth) + - pink -> FN: False Negatives (object not detected but present in the ground-truth) + """ + fp_sorted = [] + tp_sorted = [] + for key in sorted_keys: + fp_sorted.append(dictionary[key] - true_p_bar[key]) + tp_sorted.append(true_p_bar[key]) + plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive') + plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted) + # add legend + plt.legend(loc='lower right') + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + fp_val = fp_sorted[i] + tp_val = tp_sorted[i] + fp_str_val = " " + str(fp_val) + tp_str_val = fp_str_val + " " + str(tp_val) + # trick to paint multicolor with offset: + # first paint everything and then repaint the first number + t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold') + plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold') + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + else: + plt.barh(range(n_classes), sorted_values, color=plot_color) + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + str_val = " " + str(val) # add a space before + if val < 1.0: + str_val = " {0:.2f}".format(val) + t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold') + # re-set axes to show number inside the figure + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + # set window title + fig.canvas.manager.set_window_title(window_title) + # write classes in y axis + tick_font_size = 12 + plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size) + """ + Re-scale height accordingly + """ + init_height = fig.get_figheight() + # comput the matrix height in points and inches + dpi = fig.dpi + height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing) + height_in = height_pt / dpi + # compute the required figure height + top_margin = 0.15 # in percentage of the figure height + bottom_margin = 0.05 # in percentage of the figure height + figure_height = height_in / (1 - top_margin - bottom_margin) + # set new height + if figure_height > init_height: + fig.set_figheight(figure_height) + + # set plot title + plt.title(plot_title, fontsize=14) + # set axis titles + # plt.xlabel('classes') + plt.xlabel(x_label, fontsize='large') + # 
adjust size of window + fig.tight_layout() + # save the plot + fig.savefig(output_path) + # show image + if to_show: + plt.show() + # close the plot + plt.close() + +""" + Create a ".temp_files/" and "output/" directory +""" +TEMP_FILES_PATH = os.path.join(args.path, ".temp_files") +if not os.path.exists(TEMP_FILES_PATH): # if it doesn't exist already + os.makedirs(TEMP_FILES_PATH) +output_files_path = os.path.join(args.path, "mAP_output") +if os.path.exists(output_files_path): # if it exist already + # reset the output directory + shutil.rmtree(output_files_path) + +os.makedirs(output_files_path) +if draw_plot: + os.makedirs(os.path.join(output_files_path, "classes")) +if show_animation: + os.makedirs(os.path.join(output_files_path, "images", "detections_one_by_one")) + +""" + ground-truth + Load each of the ground-truth files into a temporary ".json" file. + Create a list of all the class names present in the ground-truth (gt_classes). +""" +# get a list with the ground-truth files +ground_truth_files_list = glob.glob(GT_PATH + '/*.txt') +if len(ground_truth_files_list) == 0: + error("Error: No ground-truth files found!") +ground_truth_files_list.sort() +# dictionary with counter per class +gt_counter_per_class = {} +counter_images_per_class = {} + +gt_files = [] +for txt_file in ground_truth_files_list: + #print(txt_file) + file_id = txt_file.split(".txt", 1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + # check if there is a correspondent detection-results file + temp_path = os.path.join(DR_PATH, (file_id + ".txt")) + if not os.path.exists(temp_path): + error_msg = "Error. File not found: {}\n".format(temp_path) + error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" + error(error_msg) + lines_list = file_lines_to_list(txt_file) + # create ground-truth dictionary + bounding_boxes = [] + is_difficult = False + already_seen_classes = [] + for line in lines_list: + try: + if "difficult" in line: + class_name, left, top, right, bottom, _difficult = line.split() + is_difficult = True + else: + class_name, left, top, right, bottom = line.split() + except ValueError: + error_msg = "Error: File " + txt_file + " in the wrong format.\n" + error_msg += " Expected: ['difficult']\n" + error_msg += " Received: " + line + error_msg += "\n\nIf you have a with spaces between words you should remove them\n" + error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder." 
+ error(error_msg) + # check if class is in the ignore list, if yes skip + if class_name in args.ignore: + continue + bbox = left + " " + top + " " + right + " " +bottom + if is_difficult: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True}) + is_difficult = False + else: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False}) + # count that object + if class_name in gt_counter_per_class: + gt_counter_per_class[class_name] += 1 + else: + # if class didn't exist yet + gt_counter_per_class[class_name] = 1 + + if class_name not in already_seen_classes: + if class_name in counter_images_per_class: + counter_images_per_class[class_name] += 1 + else: + # if class didn't exist yet + counter_images_per_class[class_name] = 1 + already_seen_classes.append(class_name) + + + # dump bounding_boxes into a ".json" file + new_temp_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" + gt_files.append(new_temp_file) + with open(new_temp_file, 'w') as outfile: + json.dump(bounding_boxes, outfile) + +gt_classes = list(gt_counter_per_class.keys()) +# let's sort the classes alphabetically +gt_classes = sorted(gt_classes) +n_classes = len(gt_classes) +#print(gt_classes) +#print(gt_counter_per_class) + +""" + Check format of the flag --set-class-iou (if used) + e.g. check if class exists +""" +if specific_iou_flagged: + n_args = len(args.set_class_iou) + error_msg = \ + '\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]' + if n_args % 2 != 0: + error('Error, missing arguments. Flag usage:' + error_msg) + # [class_1] [IoU_1] [class_2] [IoU_2] + # specific_iou_classes = ['class_1', 'class_2'] + specific_iou_classes = args.set_class_iou[::2] # even + # iou_list = ['IoU_1', 'IoU_2'] + iou_list = args.set_class_iou[1::2] # odd + if len(specific_iou_classes) != len(iou_list): + error('Error, missing arguments. Flag usage:' + error_msg) + for tmp_class in specific_iou_classes: + if tmp_class not in gt_classes: + error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg) + for num in iou_list: + if not is_float_between_0_and_1(num): + error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg) + +""" + detection-results + Load each of the detection-results files into a temporary ".json" file. +""" +# get a list with the detection-results files +dr_files_list = glob.glob(DR_PATH + '/*.txt') +dr_files_list.sort() + +for class_index, class_name in enumerate(gt_classes): + bounding_boxes = [] + for txt_file in dr_files_list: + #print(txt_file) + # the first time it checks if all the corresponding ground-truth files exist + file_id = txt_file.split(".txt",1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(GT_PATH, (file_id + ".txt")) + if class_index == 0: + if not os.path.exists(temp_path): + error_msg = "Error. 
File not found: {}\n".format(temp_path) + error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" + error(error_msg) + lines = file_lines_to_list(txt_file) + for line in lines: + try: + tmp_class_name, confidence, left, top, right, bottom = line.split() + except ValueError: + error_msg = "Error: File " + txt_file + " in the wrong format.\n" + error_msg += " Expected: \n" + error_msg += " Received: " + line + error(error_msg) + if tmp_class_name == class_name: + #print("match") + bbox = left + " " + top + " " + right + " " +bottom + bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox}) + #print(bounding_boxes) + # sort detection-results by decreasing confidence + bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True) + with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + +""" + Calculate the AP for each class +""" +sum_AP = 0.0 +ap_dictionary = {} +lamr_dictionary = {} +# open file to store the output +with open(output_files_path + "/output.txt", 'w') as output_file: + output_file.write("# AP and precision/recall per class\n") + count_true_positives = {} + for class_index, class_name in enumerate(gt_classes): + count_true_positives[class_name] = 0 + """ + Load detection-results of that class + """ + dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json" + dr_data = json.load(open(dr_file)) + + """ + Assign detection-results to ground-truth objects + """ + nd = len(dr_data) + tp = [0] * nd # creates an array of zeros of size nd + fp = [0] * nd + for idx, detection in enumerate(dr_data): + file_id = detection["file_id"] + if show_animation: + # find ground truth image + ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*") + #tifCounter = len(glob.glob1(myPath,"*.tif")) + if len(ground_truth_img) == 0: + error("Error. Image not found with id: " + file_id) + elif len(ground_truth_img) > 1: + error("Error. 
Multiple image with id: " + file_id) + else: # found image + #print(IMG_PATH + "/" + ground_truth_img[0]) + # Load image + img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0]) + # load image with draws of multiple detections + img_cumulative_path = output_files_path + "/images/" + ground_truth_img[0] + if os.path.isfile(img_cumulative_path): + img_cumulative = cv2.imread(img_cumulative_path) + else: + img_cumulative = img.copy() + # Add bottom border to image + bottom_border = 60 + BLACK = [0, 0, 0] + img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK) + # assign detection-results to ground truth object if any + # open ground-truth with that file_id + gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" + ground_truth_data = json.load(open(gt_file)) + ovmax = -1 + gt_match = -1 + # load detected object bounding-box + bb = [ float(x) for x in detection["bbox"].split() ] + for obj in ground_truth_data: + # look for a class_name match + if obj["class_name"] == class_name: + bbgt = [ float(x) for x in obj["bbox"].split() ] + bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] + iw = bi[2] - bi[0] + 1 + ih = bi[3] - bi[1] + 1 + if iw > 0 and ih > 0: + # compute overlap (IoU) = area of intersection / area of union + ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] + + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih + ov = iw * ih / ua + if ov > ovmax: + ovmax = ov + gt_match = obj + + # assign detection as true positive/don't care/false positive + if show_animation: + status = "NO MATCH FOUND!" # status is only used in the animation + # set minimum overlap + min_overlap = MINOVERLAP + if specific_iou_flagged: + if class_name in specific_iou_classes: + index = specific_iou_classes.index(class_name) + min_overlap = float(iou_list[index]) + if ovmax >= min_overlap: + if "difficult" not in gt_match: + if not bool(gt_match["used"]): + # true positive + tp[idx] = 1 + gt_match["used"] = True + count_true_positives[class_name] += 1 + # update the ".json" file + with open(gt_file, 'w') as f: + f.write(json.dumps(ground_truth_data)) + if show_animation: + status = "MATCH!" + else: + # false positive (multiple detection) + fp[idx] = 1 + if show_animation: + status = "REPEATED MATCH!" 
+ else: + # false positive + fp[idx] = 1 + if ovmax > 0: + status = "INSUFFICIENT OVERLAP" + + """ + Draw image to show animation + """ + if show_animation: + height, widht = img.shape[:2] + # colors (OpenCV works with BGR) + white = (255,255,255) + light_blue = (255,200,100) + green = (0,255,0) + light_red = (30,30,255) + # 1st line + margin = 10 + v_pos = int(height - margin - (bottom_border / 2.0)) + text = "Image: " + ground_truth_img[0] + " " + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width) + if ovmax != -1: + color = light_red + if status == "INSUFFICIENT OVERLAP": + text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100) + else: + text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100) + color = green + img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + # 2nd line + v_pos += int(bottom_border / 2.0) + rank_pos = str(idx+1) # rank position (idx starts at 0) + text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100) + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + color = light_red + if status == "MATCH!": + color = green + text = "Result: " + status + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + + font = cv2.FONT_HERSHEY_SIMPLEX + if ovmax > 0: # if there is intersections between the bounding-boxes + bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ] + cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA) + bb = [int(i) for i in bb] + cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA) + # show image + cv2.imshow("Animation", img) + cv2.waitKey(20) # show for 20 ms + # save image to output + output_img_path = output_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg" + cv2.imwrite(output_img_path, img) + # save the image with all the objects drawn to it + cv2.imwrite(img_cumulative_path, img_cumulative) + + #print(tp) + # compute precision/recall + cumsum = 0 + for idx, val in enumerate(fp): + fp[idx] += cumsum + cumsum += val + cumsum = 0 + for idx, val in enumerate(tp): + tp[idx] += cumsum + cumsum += val + #print(tp) + rec = tp[:] + for idx, val in enumerate(tp): + rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name] + #print(rec) + prec = tp[:] + for idx, val in enumerate(tp): + prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx]) + #print(prec) + + ap, mrec, mprec = voc_ap(rec[:], prec[:]) + sum_AP += ap + text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) + """ + Write to output.txt + """ + rounded_prec = [ '%.2f' % elem for elem in prec ] + rounded_rec = [ '%.2f' % elem for elem in rec ] + output_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") + if not args.quiet: + print(text) + ap_dictionary[class_name] = 
ap + + n_images = counter_images_per_class[class_name] + lamr, mr, fppi = log_average_miss_rate(np.array(prec), np.array(rec), n_images) + lamr_dictionary[class_name] = lamr + + """ + Draw plot + """ + if draw_plot: + plt.plot(rec, prec, '-o') + # add a new penultimate point to the list (mrec[-2], 0.0) + # since the last line segment (and respective area) do not affect the AP value + area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]] + area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]] + plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r') + # set window title + fig = plt.gcf() # gcf - get current figure + fig.canvas.manager.set_window_title('AP ' + class_name) + # set plot title + plt.title('class: ' + text) + #plt.suptitle('This is a somewhat long figure title', fontsize=16) + # set axis titles + plt.xlabel('Recall') + plt.ylabel('Precision') + # optional - set axes + axes = plt.gca() # gca - get current axes + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) # .05 to give some extra space + # Alternative option -> wait for button to be pressed + #while not plt.waitforbuttonpress(): pass # wait for key display + # Alternative option -> normal display + #plt.show() + # save the plot + fig.savefig(output_files_path + "/classes/" + class_name + ".png") + plt.cla() # clear axes for next plot + + if show_animation: + cv2.destroyAllWindows() + + output_file.write("\n# mAP of all classes\n") + mAP = sum_AP / n_classes + text = "mAP = {0:.2f}%".format(mAP*100) + output_file.write(text + "\n") + print(text) + +""" + Draw false negatives +""" +if show_animation: + pink = (203,192,255) + for tmp_file in gt_files: + ground_truth_data = json.load(open(tmp_file)) + #print(ground_truth_data) + # get name of corresponding image + start = TEMP_FILES_PATH + '/' + img_id = tmp_file[tmp_file.find(start)+len(start):tmp_file.rfind('_ground_truth.json')] + img_cumulative_path = output_files_path + "/images/" + img_id + ".jpg" + img = cv2.imread(img_cumulative_path) + if img is None: + img_path = IMG_PATH + '/' + img_id + ".jpg" + img = cv2.imread(img_path) + # draw false negatives + for obj in ground_truth_data: + if not obj['used']: + bbgt = [ int(round(float(x))) for x in obj["bbox"].split() ] + cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),pink,2) + cv2.imwrite(img_cumulative_path, img) + +# remove the temp_files directory +shutil.rmtree(TEMP_FILES_PATH) + +""" + Count total of detection-results +""" +# iterate through all the files +det_counter_per_class = {} +for txt_file in dr_files_list: + # get lines to list + lines_list = file_lines_to_list(txt_file) + for line in lines_list: + class_name = line.split()[0] + # check if class is in the ignore list, if yes skip + if class_name in args.ignore: + continue + # count that object + if class_name in det_counter_per_class: + det_counter_per_class[class_name] += 1 + else: + # if class didn't exist yet + det_counter_per_class[class_name] = 1 +#print(det_counter_per_class) +dr_classes = list(det_counter_per_class.keys()) + + +""" + Plot the total number of occurences of each class in the ground-truth +""" +if draw_plot: + window_title = "ground-truth-info" + plot_title = "ground-truth\n" + plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)" + x_label = "Number of objects per class" + output_path = output_files_path + "/ground-truth-info.png" + to_show = False + plot_color = 'forestgreen' + draw_plot_func( + gt_counter_per_class, + n_classes, + window_title, + 
plot_title, + x_label, + output_path, + to_show, + plot_color, + '', + ) + +""" + Write number of ground-truth objects per class to results.txt +""" +with open(output_files_path + "/output.txt", 'a') as output_file: + output_file.write("\n# Number of ground-truth objects per class\n") + for class_name in sorted(gt_counter_per_class): + output_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n") + +""" + Finish counting true positives +""" +for class_name in dr_classes: + # if class exists in detection-result but not in ground-truth then there are no true positives in that class + if class_name not in gt_classes: + count_true_positives[class_name] = 0 +#print(count_true_positives) + +""" + Plot the total number of occurences of each class in the "detection-results" folder +""" +if draw_plot: + window_title = "detection-results-info" + # Plot title + plot_title = "detection-results\n" + plot_title += "(" + str(len(dr_files_list)) + " files and " + count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values())) + plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)" + # end Plot title + x_label = "Number of objects per class" + output_path = output_files_path + "/detection-results-info.png" + to_show = False + plot_color = 'forestgreen' + true_p_bar = count_true_positives + draw_plot_func( + det_counter_per_class, + len(det_counter_per_class), + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + true_p_bar + ) + +""" + Write number of detected objects per class to output.txt +""" +with open(output_files_path + "/output.txt", 'a') as output_file: + output_file.write("\n# Number of detected objects per class\n") + for class_name in sorted(dr_classes): + n_det = det_counter_per_class[class_name] + text = class_name + ": " + str(n_det) + text += " (tp:" + str(count_true_positives[class_name]) + "" + text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n" + output_file.write(text) + +""" + Draw log-average miss rate plot (Show lamr of all classes in decreasing order) +""" +if draw_plot: + window_title = "lamr" + plot_title = "log-average miss rate" + x_label = "log-average miss rate" + output_path = output_files_path + "/lamr.png" + to_show = False + plot_color = 'royalblue' + draw_plot_func( + lamr_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + +""" + Draw mAP plot (Show AP's of all classes in decreasing order) +""" +if draw_plot: + window_title = "mAP" + plot_title = "mAP = {0:.2f}%".format(mAP*100) + x_label = "Average Precision" + output_path = output_files_path + "/mAP.png" + to_show = True + plot_color = 'royalblue' + draw_plot_func( + ap_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) \ No newline at end of file diff --git a/dataset.py b/dataset.py new file mode 100644 index 0000000..98ffc5a --- /dev/null +++ b/dataset.py @@ -0,0 +1,96 @@ +import os +import json +import torch +from torchvision import datasets +from torchvision import transforms +from torch.utils.data import DataLoader +from functools import partial + + +class_to_idx = { + 'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4, + 'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 'diningtable':10, + 'dog':11, 'horse':12, 'motorbike':13, 'person':14, 'pottedplant':15, + 'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19 +} +idx_to_class = {i:c for c, i in 
class_to_idx.items()} + + +def get_pascal_voc2007_data(image_root, split='train'): + """ + Use torchvision.datasets + https://pytorch.org/docs/stable/torchvision/datasets.html#torchvision.datasets.VOCDetection + """ + + train_dataset = datasets.VOCDetection(image_root, year='2007', image_set=split, + download=False) + + return train_dataset + + +def pascal_voc2007_loader(dataset, batch_size, num_workers=0, shuffle=False, proposal_path=None): + """ + Data loader for Pascal VOC 2007. + https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader + """ + collate_fn = partial(voc_collate_fn, proposal_path=proposal_path) + train_loader = DataLoader(dataset, + batch_size=batch_size, + shuffle=shuffle, pin_memory=True, + num_workers=num_workers, + collate_fn=collate_fn) + return train_loader + + +def voc_collate_fn(batch_lst, reshape_size=224, proposal_path=None): + preprocess = transforms.Compose([ + transforms.Resize((reshape_size, reshape_size)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + + batch_size = len(batch_lst) + + img_batch = torch.zeros(batch_size, 3, reshape_size, reshape_size) + + box_list = [] + box_batch_idx = [] + w_list = [] + h_list = [] + img_id_list = [] + proposal_list = [] + proposal_batch_idx = [] + + for i in range(batch_size): + img, ann = batch_lst[i] + w_list.append(img.size[0]) # image width + h_list.append(img.size[1]) # image height + img_id_list.append(ann['annotation']['filename']) + img_batch[i] = preprocess(img) + all_bbox = ann['annotation']['object'] + if type(all_bbox) == dict: # inconsistency in the annotation file + all_bbox = [all_bbox] + for bbox_idx, one_bbox in enumerate(all_bbox): + bbox = one_bbox['bndbox'] + obj_cls = one_bbox['name'] + box_list.append(torch.Tensor([float(bbox['xmin']), float(bbox['ymin']), + float(bbox['xmax']), float(bbox['ymax']), class_to_idx[obj_cls]])) + box_batch_idx.append(i) + if proposal_path is not None: + proposal_fn = ann['annotation']['filename'].replace('.jpg', '.json') + with open(os.path.join(proposal_path, proposal_fn), 'r') as f: + proposal = json.load(f) + for p in proposal: + proposal_list.append([p['x_min'], p['y_min'], p['x_max'], p['y_max']]) + proposal_batch_idx.append(i) + + h_batch = torch.tensor(h_list) + w_batch = torch.tensor(w_list) + box_batch = torch.stack(box_list) + box_batch_ids = torch.tensor(box_batch_idx, dtype=torch.long) + proposals = torch.tensor(proposal_list, dtype=box_batch.dtype) + proposal_batch_ids = torch.tensor(proposal_batch_idx, dtype=torch.long) + assert len(box_batch) == len(box_batch_ids) + assert len(proposals) == len(proposal_batch_ids) + + return img_batch, box_batch, box_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_id_list \ No newline at end of file diff --git a/loss.py b/loss.py new file mode 100644 index 0000000..d1ea3eb --- /dev/null +++ b/loss.py @@ -0,0 +1,33 @@ +import torch +import torch.nn.functional as F + + +def ClsScoreRegression(cls_scores, GT_label, batch_size): + """ + Multi-class cross-entropy loss + + Inputs: + - cls_scores: Predicted class scores, of shape (M, C). + - GT_label: GT class labels, of shape (M,). + + Outputs: + - cls_score_loss: Torch scalar + """ + cls_loss = F.cross_entropy(cls_scores, GT_label, \ + reduction='sum') * 1. 
/ batch_size + return cls_loss + + +def BboxRegression(offsets, GT_offsets, batch_size): + """" + Use SmoothL1 loss as in Faster R-CNN + + Inputs: + - offsets: Predicted box offsets, of shape (M, 4) + - GT_offsets: GT box offsets, of shape (M, 4) + + Outputs: + - bbox_reg_loss: Torch scalar + """ + bbox_reg_loss = F.smooth_l1_loss(offsets, GT_offsets, reduction='sum') * 1. / batch_size + return bbox_reg_loss \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..276e2a3 --- /dev/null +++ b/main.py @@ -0,0 +1,204 @@ +import math +import copy +import time +import shutil +import os +import random +os.environ['TORCH_HOME'] = './ckpts' + +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import optim +import torchvision +import matplotlib.pyplot as plt +import numpy as np +import cv2 + +from dataset import get_pascal_voc2007_data, pascal_voc2007_loader, idx_to_class +from model import FastRCNN +from utils import coord_trans, data_visualizer + + +def parse_args(): + parser = argparse.ArgumentParser('Faster R-CNN', add_help=False) + parser.add_argument('--lr', default=1e-3, type=float) + parser.add_argument('--lr_decay', default=1.0, type=float) + parser.add_argument('--batch_size', default=16, type=int) + parser.add_argument('--epochs', default=200, type=int) + parser.add_argument('--num_workers', default=4, type=int) + parser.add_argument('--overfit_small_data', default=False, action='store_true') + parser.add_argument('--output_dir', default='./exp/fast_rcnn') + + args = parser.parse_args() + + return args + + +def main(args): + torch.manual_seed(0) + torch.cuda.manual_seed(0) + random.seed(0) + if args.overfit_small_data: + args.output_dir = args.output_dir + "_overfit_small" + os.makedirs(args.output_dir, exist_ok=True) + + # build dataset & dataloader + train_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'train') + val_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'val') + + train_loader = pascal_voc2007_loader(train_dataset, args.batch_size, shuffle=True, num_workers=args.num_workers, + proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals') + val_loader = pascal_voc2007_loader(val_dataset, args.batch_size, num_workers=args.num_workers, + proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals') + + if args.overfit_small_data: + num_sample = 10 + small_dataset = torch.utils.data.Subset( + train_dataset, + torch.linspace(0, len(train_dataset)-1, steps=num_sample).long() + ) + small_train_loader = pascal_voc2007_loader(small_dataset, 10, + proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals') + val_dataset = small_dataset + train_loader = small_train_loader + val_loader = small_train_loader + + model = FastRCNN() + model.cuda() + + # build optimizer + optimizer = optim.SGD( + filter(lambda p: p.requires_grad, model.parameters()), + args.lr + ) + lr_scheduler = optim.lr_scheduler.LambdaLR( + optimizer, + lambda epoch: args.lr_decay ** epoch + ) + + # load ckpt + ckpt_path = os.path.join(args.output_dir, 'checkpoint.pth') + start_epoch = 0 + if os.path.exists(ckpt_path): + checkpoint = torch.load(ckpt_path) + start_epoch = checkpoint['epoch'] + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_sched']) + + if start_epoch < args.epochs: + train(args, model, train_loader, optimizer, lr_scheduler, start_epoch) + 
inference(args, model, val_loader, val_dataset, visualize=args.overfit_small_data) + + +def train(args, model, train_loader, optimizer, lr_scheduler, start_epoch): + loss_history = [] + model.train() + for i in range(start_epoch, args.epochs): + start_t = time.time() + for iter_num, data_batch in enumerate(train_loader): + images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, _ = data_batch + resized_boxes = coord_trans(boxes, boxes_batch_ids, w_batch, h_batch, mode='p2a') + resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a') + + images = images.to(dtype=torch.float, device='cuda') + resized_boxes = resized_boxes.to(dtype=torch.float, device='cuda') + boxes_batch_ids = boxes_batch_ids.cuda() + resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda') + proposal_batch_ids = proposal_batch_ids.cuda() + + loss = model(images, resized_boxes, boxes_batch_ids, resized_proposals, proposal_batch_ids) + optimizer.zero_grad() + loss.backward() + loss_history.append(loss.item()) + optimizer.step() + + if iter_num % 50 == 0: + print('(Iter {} / {}) loss: {:.4f}'.format(iter_num, len(train_loader), np.mean(loss_history[-50:]))) + + end_t = time.time() + print('(Epoch {} / {}) loss: {:.4f}, time per epoch: {:.1f}s'.format( + i, args.epochs, np.mean(loss_history[-len(train_loader):]), end_t-start_t)) + lr_scheduler.step() + + checkpoint = { + 'epoch': i + 1, + 'model': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_sched': lr_scheduler.state_dict()} + torch.save(checkpoint, os.path.join(args.output_dir, 'checkpoint.pth')) + + # plot the training losses + fig, ax = plt.subplots() + ax.plot(loss_history) + ax.set_xlabel('Iteration') + ax.set_ylabel('Loss') + ax.set_title('Training loss history') + fig.savefig(os.path.join(args.output_dir, 'training_loss.png')) + plt.close() + + +def inference(args, model, val_loader, dataset, thresh=0.5, nms_thresh=0.5, visualize=False): + model.eval() + start_t = time.time() + + if args.output_dir is not None: + det_dir = os.path.join(args.output_dir, 'mAP_input/detection-results') + gt_dir = os.path.join(args.output_dir, 'mAP_input/ground-truth') + vis_dir = os.path.join(args.output_dir, 'visualize') + os.makedirs(det_dir, exist_ok=True) + os.makedirs(gt_dir, exist_ok=True) + os.makedirs(vis_dir, exist_ok=True) + + for iter_num, data_batch in enumerate(val_loader): + images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_ids = data_batch + images = images.to(dtype=torch.float, device='cuda') + resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a') + resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda') + proposal_batch_ids = proposal_batch_ids.cuda() + + with torch.no_grad(): + final_proposals, final_conf_scores, final_class = \ + model.inference(images, resized_proposals, proposal_batch_ids, thresh=thresh, nms_thresh=nms_thresh) + + # clamp on the proposal coordinates + batch_size = len(images) + for idx in range(batch_size): + torch.clamp_(final_proposals[idx][:, 0::2], min=0, max=w_batch[idx]) + torch.clamp_(final_proposals[idx][:, 1::2], min=0, max=h_batch[idx]) + + # visualization + # get the original image + # hack to get the original image so we don't have to load from local again... 
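+                # (assumes the val_loader iterates `dataset` in order with a constant batch size,
+                #  i.e. shuffle=False and no dropped last batch, so batch_size*iter_num + idx
+                #  points back at the same sample index in `dataset`)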
+ i = batch_size*iter_num + idx + img, _ = dataset.__getitem__(i) + + box_per_img = boxes[boxes_batch_ids==idx] + final_all = torch.cat((final_proposals[idx], \ + final_class[idx].float(), final_conf_scores[idx]), dim=-1).cpu() + final_batch_idx = torch.LongTensor([idx] * final_all.shape[0]) + resized_final_proposals = coord_trans(final_all, final_batch_idx, w_batch, h_batch) + + # write results to file for evaluation (use mAP API https://github.com/Cartucho/mAP for now...) + if args.output_dir is not None: + file_name = img_ids[idx].replace('.jpg', '.txt') + with open(os.path.join(det_dir, file_name), 'w') as f_det, \ + open(os.path.join(gt_dir, file_name), 'w') as f_gt: + print('{}: {} GT bboxes and {} proposals'.format(img_ids[idx], len(box_per_img), resized_final_proposals.shape[0])) + for b in box_per_img: + f_gt.write('{} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[0], b[1], b[2], b[3])) + for b in resized_final_proposals: + f_det.write('{} {:.6f} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[5], b[0], b[1], b[2], b[3])) + + if visualize: + data_visualizer(img, idx_to_class, os.path.join(vis_dir, img_ids[idx]), box_per_img, resized_final_proposals) + + end_t = time.time() + print('Total inference time: {:.1f}s'.format(end_t-start_t)) + + +if __name__=='__main__': + args = parse_args() + main(args) \ No newline at end of file diff --git a/model.py b/model.py new file mode 100644 index 0000000..f7323dd --- /dev/null +++ b/model.py @@ -0,0 +1,212 @@ +import math + +import torch +import torch.nn as nn +import torchvision +from torchvision import models + +from utils import compute_offsets, assign_label, generate_proposal +from loss import ClsScoreRegression, BboxRegression + + +class FeatureExtractor(nn.Module): + """ + Image feature extraction with MobileNet. + """ + def __init__(self, reshape_size=224, pooling=False, verbose=False): + super().__init__() + + self.mobilenet = models.mobilenet_v2(pretrained=True) + self.mobilenet = nn.Sequential(*list(self.mobilenet.children())[:-1]) # Remove the last classifier + + # average pooling + if pooling: + self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size/32.))) # input: N x 1280 x 7 x 7 + + for i in self.mobilenet.named_parameters(): + i[1].requires_grad = True # fine-tune all + + def forward(self, img, verbose=False): + """ + Inputs: + - img: Batch of resized images, of shape Nx3x224x224 + + Outputs: + - feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7 + """ + num_img = img.shape[0] + + img_prepro = img + + feat = [] + process_batch = 500 + for b in range(math.ceil(num_img/process_batch)): + feat.append(self.mobilenet(img_prepro[b*process_batch:(b+1)*process_batch] + ).squeeze(-1).squeeze(-1)) # forward and squeeze + feat = torch.cat(feat) + + if verbose: + print('Output feature shape: ', feat.shape) + + return feat + + +class FastRCNN(nn.Module): + def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \ + roi_output_w=2, roi_output_h=2, drop_ratio=0.3): + super().__init__() + + assert(num_classes != 0) + self.num_classes = num_classes + self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h + self.feat_extractor = FeatureExtractor() + ############################################################################## + # TODO: Declare the cls & bbox heads (in Fast R-CNN). # + # The cls & bbox heads share a sequential module with a Linear layer, # + # followed by a Dropout (p=drop_ratio), a ReLU nonlinearity and another # + # Linear layer. 
# + # The cls head is a Linear layer that predicts num_classes + 1 (background). # + # The det head is a Linear layer that predicts offsets(dim=4). # + # HINT: The dimension of the two Linear layers are in_dim -> hidden_dim and # + # hidden_dim -> hidden_dim. # + ############################################################################## + # Replace "pass" statement with your code + pass + ############################################################################## + # END OF YOUR CODE # + ############################################################################## + + def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids): + """ + Training-time forward pass for our two-stage Faster R-CNN detector. + + Inputs: + - images: Tensor of shape (B, 3, H, W) giving input images + - bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes + and category labels, from the dataloader, where N is the total number + of GT boxes in the batch + - bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch) + of the image that each GT box belongs to + - proposals: Tensor of shape (M, 4) giving the proposals for input images, + where M is the total number of proposals in the batch + - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image + that each proposals belongs to + + Outputs: + - total_loss: Torch scalar giving the overall training loss. + """ + w_cls = 1 # for cls_scores + w_bbox = 1 # for offsets + total_loss = None + ############################################################################## + # TODO: Implement the forward pass of Fast R-CNN. # + # A few key steps are outlined as follows: # + # i) Extract image fearure. # + # ii) Perform RoI Align on proposals, then meanpool the feature in the # + # spatial dimension. # + # iii) Pass the RoI feature through the shared-fc layer. Predict # + # classification scores ans box offsets. # + # iv) Assign the proposals with targets of each image. # + # v) Compute the cls_loss between the predicted class_prob and GT_class # + # (For poistive & negative proposals) # + # Compute the bbox_loss between the offsets and GT_offsets # + # (For positive proposals) # + # Compute the total_loss which is formulated as: # + # total_loss = w_cls*cls_loss + w_bbox*bbox_loss. 
# + ############################################################################## + # Replace "pass" statement with your code + B, _, H, W = images.shape + + # extract image feature + pass + + # perform RoI Pool & mean pool + pass + + # forward heads, get predicted cls scores & offsets + pass + + # assign targets with proposals + pos_masks, neg_masks, GT_labels, GT_bboxes = [], [], [], [] + for img_idx in range(B): + # get the positive/negative proposals and corresponding + # GT box & class label of this image + pass + + # compute loss + pass + + ############################################################################## + # END OF YOUR CODE # + ############################################################################## + return total_loss + + def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7): + """" + Inference-time forward pass for our two-stage Faster R-CNN detector + + Inputs: + - images: Tensor of shape (B, 3, H, W) giving input images + - proposals: Tensor of shape (M, 4) giving the proposals for input images, + where M is the total number of proposals in the batch + - proposal_batch_ids: Tensor of shape (M, ) giving the index of the image + that each proposals belongs to + - thresh: Threshold value on confidence probability. HINT: You can convert the + classification score to probability using a softmax nonlinearity. + - nms_thresh: IoU threshold for NMS + + We can output a variable number of predicted boxes per input image. + In particular we assume that the input images[i] gives rise to P_i final + predicted boxes. + + Outputs: + - final_proposals: List of length (B,) where final_proposals[i] is a Tensor + of shape (P_i, 4) giving the coordinates of the final predicted boxes for + the input images[i] + - final_conf_probs: List of length (B,) where final_conf_probs[i] is a + Tensor of shape (P_i, 1) giving the predicted probabilites that the boxes + in final_proposals[i] are objects (vs background) + - final_class: List of length (B,), where final_class[i] is an int64 Tensor + of shape (P_i, 1) giving the predicted category labels for each box in + final_proposals[i]. + """ + final_proposals, final_conf_probs, final_class = None, None, None + ############################################################################## + # TODO: Predicting the final proposal coordinates `final_proposals`, # + # confidence scores `final_conf_probs`, and the class index `final_class`. # + # The overall steps are similar to the forward pass, but now you cannot # + # decide the activated nor negative proposals without GT boxes. # + # You should apply post-processing (thresholding and NMS) to all proposals # + # and keep the final proposals. 
# + ############################################################################## + # Replace "pass" statement with your code + B = images.shape[0] + + # extract image feature + pass + + # perform RoI Pool & mean pool + pass + + # forward heads, get predicted cls scores & offsets + pass + + # get predicted boxes & class label & confidence probability + pass + + final_proposals = [] + final_conf_probs = [] + final_class = [] + # post-process to get final predictions + for img_idx in range(B): + + # filter by threshold + pass + + # nms + pass + + ############################################################################## + # END OF YOUR CODE # + ############################################################################## + return final_proposals, final_conf_probs, final_class \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..858cf26 --- /dev/null +++ b/utils.py @@ -0,0 +1,287 @@ +import numpy as np +import cv2 +from matplotlib import pyplot as plt +import torch + + +def data_visualizer(img, idx_to_class, path, bbox=None, pred=None): + """ + Data visualizer on the original image. Support both GT box input and proposal input. + + Input: + - img: PIL Image input + - idx_to_class: Mapping from the index (0-19) to the class name + - bbox: GT bbox (in red, optional), a tensor of shape Nx5, where N is + the number of GT boxes, 5 indicates (x_tl, y_tl, x_br, y_br, class) + - pred: Predicted bbox (in green, optional), a tensor of shape N'x6, where + N' is the number of predicted boxes, 6 indicates + (x_tl, y_tl, x_br, y_br, class, object confidence score) + """ + + img_copy = np.array(img).astype('uint8') + + if bbox is not None: + for bbox_idx in range(bbox.shape[0]): + one_bbox = bbox[bbox_idx][:4].numpy().astype('int') + cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2], + one_bbox[3]), (255, 0, 0), 2) + if bbox.shape[1] > 4: # if class info provided + obj_cls = idx_to_class[bbox[bbox_idx][4].item()] + cv2.putText(img_copy, '%s' % (obj_cls), + (one_bbox[0], one_bbox[1]+15), + cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1) + + if pred is not None: + for bbox_idx in range(pred.shape[0]): + one_bbox = pred[bbox_idx][:4].numpy().astype('int') + cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2], + one_bbox[3]), (0, 255, 0), 2) + + if pred.shape[1] > 4: # if class and conf score info provided + obj_cls = idx_to_class[pred[bbox_idx][4].item()] + conf_score = pred[bbox_idx][5].item() + cv2.putText(img_copy, '%s, %.2f' % (obj_cls, conf_score), + (one_bbox[0], one_bbox[1]+15), + cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1) + + plt.imshow(img_copy) + plt.axis('off') + plt.title(path) + plt.savefig(path) + plt.close() + + +def coord_trans(bbox, bbox_batch_idx, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'): + """ + Coordinate transformation function. It converts the box coordinate from + the image coordinate system to the activation map coordinate system and vice versa. + In our case, the input image will have a few hundred of pixels in + width/height while the activation map is of size 7x7. 
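+    For example, 'p2a' on a 500-pixel-wide image maps an x coordinate of 350 to
+    350 / (500 / 7) = 4.9 activation-map units; 'a2p' applies the inverse scaling.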
+ + Input: + - bbox: Could be either bbox, anchor, or proposal, of shape Mx4 + - bbox_batch_idx: Index of the image that each bbox belongs to, of shape M + - w_pixel: Number of pixels in the width side of the original image, of shape B + - h_pixel: Number of pixels in the height side of the original image, of shape B + - w_amap: Number of pixels in the width side of the activation map, scalar + - h_amap: Number of pixels in the height side of the activation map, scalar + - mode: Whether transfer from the original image to activation map ('p2a') or + the opposite ('a2p') + + Output: + - resized_bbox: Resized box coordinates, of the same shape as the input bbox + """ + + assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!' + assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1' + + if bbox.shape[0] == 0: # corner cases + return bbox + + resized_bbox = bbox.clone() + + if mode == 'p2a': + # pixel to activation + width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap + height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap + resized_bbox[:, [0, 2]] /= width_ratio.view(-1, 1) + resized_bbox[:, [1, 3]] /= height_ratio.view(-1, 1) + else: + # activation to pixel + width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap + height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap + resized_bbox[:, [0, 2]] *= width_ratio.view(-1, 1) + resized_bbox[:, [1, 3]] *= height_ratio.view(-1, 1) + + return resized_bbox + + +def generate_anchor(anc_per_grid, grid): + """ + Anchor generator. + + Inputs: + - anc_per_grid: Tensor of shape (A, 2) giving the shapes of anchor boxes to + consider at each point in the grid. anc_per_grid[a] = (w, h) gives the width + and height of the a'th anchor shape. + - grid: Tensor of shape (B, H', W', 2) giving the (x, y) coordinates of the + center of each feature from the backbone feature map. This is the tensor + returned from GenerateGrid. + + Outputs: + - anchors: Tensor of shape (B, A, H', W', 4) giving the positions of all + anchor boxes for the entire image. anchors[b, a, h, w] is an anchor box + centered at grid[b, h, w], whose shape is given by anc[a]; we parameterize + boxes as anchors[b, a, h, w] = (x_tl, y_tl, x_br, y_br), where (x_tl, y_tl) + and (x_br, y_br) give the xy coordinates of the top-left and bottom-right + corners of the box. + """ + A, _ = anc_per_grid.shape + B, H, W, _ = grid.shape + anc_per_grid = anc_per_grid.to(grid) + + anc_per_grid = anc_per_grid.view(1, A, 1, 1, -1).repeat(B, 1, H, W, 1) + grid = grid.view(B, 1, H, W, -1).repeat(1, A, 1, 1, 1) + + x1y1 = grid - anc_per_grid / 2 + x2y2 = grid + anc_per_grid / 2 + anchors = torch.cat([x1y1, x2y2], dim=-1) + + return anchors + + +def compute_iou(anchors, bboxes): + """ + Compute the intersection-over-union between anchors and gts. + + Inputs: + - anchors: Anchor boxes, of shape (M, 4), where M is the number of proposals + - bboxes: GT boxes of shape (N, 4), where N is the number of GT boxes, + 4 indicates (x_{lr}^{gt}, y_{lr}^{gt}, x_{rb}^{gt}, y_{rb}^{gt}) + + Outputs: + - iou: IoU matrix of shape (M, N) + """ + iou = None + ############################################################################## + # TODO: Given anchors and gt bboxes, # + # compute the iou between each anchor and gt bbox. 
# + ############################################################################## + pass + + ############################################################################## + # END OF YOUR CODE # + ############################################################################## + + return iou + + +def compute_offsets(anchors, bboxes): + """ + Compute the offsets between anchors and gts. + + Inputs: + - anchors: Anchor boxes, of shape (M, 4) + - bboxes: GT boxes of shape (M, 4), + 4 indicates (x_{lr}^{gt}, y_{lr}^{gt}, x_{rb}^{gt}, y_{rb}^{gt}) + + Outputs: + - offsets: offsets of shape (M, 4) + """ + wh_offsets = torch.log((bboxes[:, 2:4] - bboxes[:, :2]) \ + / (anchors[:, 2:4] - anchors[:, :2])) + + xy_offsets = (bboxes[:, :2] + bboxes[:, 2:4] - \ + anchors[:, :2] - anchors[:, 2:4]) / 2. + + xy_offsets /= (anchors[:, 2:4] - anchors[:, :2]) + + offsets = torch.cat((xy_offsets, wh_offsets), dim=-1) + + return offsets + + +def generate_proposal(anchors, offsets): + """ + Proposal generator. + + Inputs: + - anchors: Anchor boxes, of shape (M, 4). Anchors are represented + by the coordinates of their top-left and bottom-right corners. + - offsets: Transformations of shape (M, 4) that will be used to + convert anchor boxes into region proposals. The transformation + offsets[m] = (tx, ty, tw, th) will be applied to the anchor + anchors[m]. + + Outputs: + - proposals: Region proposals of shape (M, 4), represented by the + coordinates of their top-left and bottom-right corners. Applying the + transform offsets[m] to the anchor[m] should give the + proposal proposals[m]. + + """ + proposals = None + ############################################################################## + # TODO: Given anchor coordinates and the proposed offset for each anchor, # + # compute the proposal coordinates using the transformation formulas above. # + ############################################################################## + # Replace "pass" statement with your code + pass + + ############################################################################## + # END OF YOUR CODE # + ############################################################################## + + return proposals + + +@torch.no_grad() +def assign_label(proposals, bboxes, background_id, pos_thresh=0.5, neg_thresh=0.5, pos_fraction=0.25): + """ + Determine the activated (positive) and negative proposals for model training. + + For Fast R-CNN - Positive proposals are defined Any of the two + (i) the proposal/proposals with the highest IoU overlap with a GT box, or + (ii) a proposal that has an IoU overlap higher than positive threshold with any GT box. + Note: One proposal can match at most one GT box (the one with the largest IoU overlapping). + + We assign a negative label to a proposal if its IoU ratio is lower than + a threshold value for all GT boxes. Proposals that are neither positive nor negative + do not contribute to the training objective. + + Main steps include: + i) Decide activated and negative proposals based on the IoU matrix. + ii) Compute GT confidence score/offsets/object class on the positive proposals. + iii) Compute GT confidence score on the negative proposals. 
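+    Note: negative proposals are randomly subsampled so that activated (positive)
+    proposals make up roughly pos_fraction of the proposals kept for training.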
+ + Inputs: + - proposal: Proposal boxes, of shape (M, 4), where M is the number of proposals + - bboxes: GT boxes of shape Nx5, where N is the number of GT boxes, + 5 indicates (x_{lr}^{gt}, y_{lr}^{gt}, x_{rb}^{gt}, y_{rb}^{gt}) and class index + - background_id: Class id of the background class + - pos_thresh: Positive threshold value + - neg_thresh: Negative threshold value + - pos_fraction: a factor balancing pos/neg proposals + + Outputs: + - activated_anc_mask: a binary mask indicating the activated proposals, of shape M + - negative_anc_mask: a binary mask indicating the negative proposals, of shape M + - GT_class: GT class category on all proposals, background class for non-activated proposals, + of shape M + - bboxes: GT bboxes on activated proposals, of shape M'x4, where M' is the number of + activated proposals + """ + M = proposals.shape[0] + N = bboxes.shape[0] + iou_mat = compute_iou(proposals, bboxes[:, :4]) + + # activated/positive proposals + max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1) + max_iou_per_box = iou_mat.max(dim=0, keepdim=True)[0] + activated_anc_mask = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0) + activated_anc_mask |= (iou_mat > pos_thresh) # using the pos_thresh condition as well + activated_anc_mask = activated_anc_mask.max(dim=-1)[0] # (M, ) + activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1) + + # GT class + box_cls = bboxes[:, 4].long().view(1, N).expand(M, N) + # if a proposal matches multiple GT boxes, choose the box with the largest iou + GT_class = torch.gather(box_cls, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1) # M + GT_class[~activated_anc_mask] = background_id + + # GT bboxes + bboxes_expand = bboxes[:, :4].view(1, N, 4).expand((M, N, 4)) + bboxes = torch.gather(bboxes_expand, -2, max_iou_per_anc_ind.unsqueeze(-1) \ + .unsqueeze(-1).expand(M, 1, 4)).view(M, 4) + bboxes = bboxes[activated_anc_ind] + + # negative anchors + negative_anc_mask = (max_iou_per_anc < neg_thresh) + negative_anc_ind = torch.nonzero(negative_anc_mask.view(-1)).squeeze(-1) + # balance pos/neg anchors, random choose + num_neg = int(activated_anc_ind.shape[0] * (1 - pos_fraction) / pos_fraction) + negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (num_neg,))] + negative_anc_mask = torch.zeros_like(negative_anc_mask) + negative_anc_mask[negative_anc_ind] = 1 + + return activated_anc_mask, negative_anc_mask, GT_class, bboxes \ No newline at end of file diff --git a/视听导_编程作业2_说明文档.pdf b/视听导_编程作业2_说明文档.pdf new file mode 100644 index 0000000..d3f9ae2 Binary files /dev/null and b/视听导_编程作业2_说明文档.pdf differ
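The TODO stubs above (the cls/bbox heads in model.py, compute_iou and generate_proposal in utils.py) are intentionally left blank for the assignment. The sketches below illustrate one way to satisfy the documented interfaces; they are not the assignment's reference solution, and attribute names such as shared_fc, cls_head and bbox_head are hypothetical.

A minimal head declaration for FastRCNN.__init__, following the layout spelled out in its TODO comment (shared Linear -> Dropout -> ReLU -> Linear, a (num_classes + 1)-way classification head, and a 4-dim offset head):

    # inside FastRCNN.__init__ (nn is torch.nn, already imported in model.py)
    self.shared_fc = nn.Sequential(
        nn.Linear(in_dim, hidden_dim),      # in_dim -> hidden_dim
        nn.Dropout(drop_ratio),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim),  # hidden_dim -> hidden_dim
    )
    self.cls_head = nn.Linear(hidden_dim, self.num_classes + 1)  # +1 for background
    self.bbox_head = nn.Linear(hidden_dim, 4)                    # (tx, ty, tw, th)

compute_iou as a broadcast between the (M, 4) proposals and (N, 4) GT boxes, producing the (M, N) IoU matrix documented in utils.py:

    import torch

    def compute_iou(anchors, bboxes):
        # intersection corners, broadcast to (M, N)
        x_tl = torch.max(anchors[:, None, 0], bboxes[None, :, 0])
        y_tl = torch.max(anchors[:, None, 1], bboxes[None, :, 1])
        x_br = torch.min(anchors[:, None, 2], bboxes[None, :, 2])
        y_br = torch.min(anchors[:, None, 3], bboxes[None, :, 3])
        inter = (x_br - x_tl).clamp(min=0) * (y_br - y_tl).clamp(min=0)
        # individual areas and union
        area_anc = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
        area_box = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
        union = area_anc[:, None] + area_box[None, :] - inter
        return inter / union

generate_proposal as the inverse of compute_offsets above, assuming offsets = (tx, ty, tw, th) with the center shift normalized by anchor width/height and log-scaled width/height ratios:

    def generate_proposal(anchors, offsets):
        anc_wh = anchors[:, 2:4] - anchors[:, :2]
        anc_ctr = (anchors[:, :2] + anchors[:, 2:4]) / 2.
        prop_ctr = anc_ctr + offsets[:, :2] * anc_wh   # undo normalized center offset
        prop_wh = anc_wh * torch.exp(offsets[:, 2:4])  # undo log width/height offset
        return torch.cat([prop_ctr - prop_wh / 2., prop_ctr + prop_wh / 2.], dim=-1)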