first commit

ClF3 2024-11-13 13:46:39 +08:00
commit 16b08f040b
8 changed files with 1743 additions and 0 deletions

.gitignore (vendored) Normal file (+7 lines)

@@ -0,0 +1,7 @@
.venv
__pycache__
*.pyc
*.pyo
ckpts
data
exp

compute_mAP.py Normal file (+904 lines)

@@ -0,0 +1,904 @@
import glob
import json
import os
import shutil
import operator
import sys
import argparse
import math
import numpy as np
MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge)
parser = argparse.ArgumentParser()
parser.add_argument('--path', type=str, help="experiment directory containing the mAP_input/ folder (ground-truth and detection-results)")
parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true")
parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true")
parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true")
# argparse receiving list of classes to be ignored (e.g., python compute_mAP.py --ignore person book)
parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.")
# argparse receiving list of classes with specific IoU (e.g., python compute_mAP.py --set-class-iou person 0.7)
parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.")
args = parser.parse_args()
'''
 0,0 ------> x (width)
  |
  |  (Left,Top)
  |      *_________
  |      |         |
  |      |         |
  y      |_________|
(height)            *
                  (Right,Bottom)
'''
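# Expected layout under --path (the ground-truth/detection files are written
# by inference() in main.py; image files are optional):
#   mAP_input/ground-truth/<image_id>.txt       lines: <class_name> <left> <top> <right> <bottom> [difficult]
#   mAP_input/detection-results/<image_id>.txt  lines: <class_name> <confidence> <left> <top> <right> <bottom>
#   mAP_input/images-optional/<image_id>.*      only used for the optional animation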
# if there are no classes to ignore then replace None by empty list
if args.ignore is None:
args.ignore = []
specific_iou_flagged = False
if args.set_class_iou is not None:
specific_iou_flagged = True
# make sure that the cwd() is the location of the python script (so that every path makes sense)
os.chdir(os.path.dirname(os.path.abspath(__file__)))
GT_PATH = os.path.join(args.path, 'mAP_input', 'ground-truth')
DR_PATH = os.path.join(args.path, 'mAP_input', 'detection-results')
# if there are no images then no animation can be shown
IMG_PATH = os.path.join(args.path, 'mAP_input', 'images-optional')
if os.path.exists(IMG_PATH):
for dirpath, dirnames, files in os.walk(IMG_PATH):
if not files:
# no image files found
args.no_animation = True
else:
args.no_animation = True
# try to import OpenCV if the user didn't choose the option --no-animation
show_animation = False
if not args.no_animation:
try:
import cv2
show_animation = True
except ImportError:
print("\"opencv-python\" not found, please install to visualize the results.")
args.no_animation = True
# try to import Matplotlib if the user didn't choose the option --no-plot
draw_plot = False
if not args.no_plot:
try:
import matplotlib.pyplot as plt
draw_plot = True
except ImportError:
print("\"matplotlib\" not found, please install it to get the resulting plots.")
args.no_plot = True
def log_average_miss_rate(prec, rec, num_images):
"""
log-average miss rate:
Calculated by averaging miss rates at 9 evenly spaced FPPI points
between 1e-2 and 1e0, in log-space.
output:
lamr | log-average miss rate
mr | miss rate
fppi | false positives per image
references:
[1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
State of the Art." Pattern Analysis and Machine Intelligence, IEEE
Transactions on 34.4 (2012): 743 - 761.
"""
# if there were no detections of that class
if prec.size == 0:
lamr = 0
mr = 1
fppi = 0
return lamr, mr, fppi
fppi = (1 - prec)
mr = (1 - rec)
fppi_tmp = np.insert(fppi, 0, -1.0)
mr_tmp = np.insert(mr, 0, 1.0)
# Use 9 evenly spaced reference points in log-space
ref = np.logspace(-2.0, 0.0, num = 9)
for i, ref_i in enumerate(ref):
# np.where() will always find at least 1 index, since min(ref) = 0.01 and min(fppi_tmp) = -1.0
j = np.where(fppi_tmp <= ref_i)[-1][-1]
ref[i] = mr_tmp[j]
# log(0) is undefined, so we use the np.maximum(1e-10, ref)
lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref))))
return lamr, mr, fppi
"""
throw error and exit
"""
def error(msg):
print(msg)
sys.exit(0)
"""
check if the number is a float between 0.0 and 1.0
"""
def is_float_between_0_and_1(value):
try:
val = float(value)
if val > 0.0 and val < 1.0:
return True
else:
return False
except ValueError:
return False
"""
Calculate the AP given the recall and precision array
1st) We compute a version of the measured precision/recall curve with
precision monotonically decreasing
2nd) We compute the AP as the area under this curve by numerical integration.
"""
def voc_ap(rec, prec):
"""
--- Official matlab code VOC2012---
mrec=[0 ; rec ; 1];
mpre=[0 ; prec ; 0];
for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
end
i=find(mrec(2:end)~=mrec(1:end-1))+1;
ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
rec.insert(0, 0.0) # insert 0.0 at beginning of list
rec.append(1.0) # insert 1.0 at end of list
mrec = rec[:]
prec.insert(0, 0.0) # insert 0.0 at beginning of list
prec.append(0.0) # insert 0.0 at end of list
mpre = prec[:]
"""
This part makes the precision monotonically decreasing
(goes from the end to the beginning)
matlab: for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
"""
# matlab indexes start in 1 but python in 0, so I have to do:
# range(start=(len(mpre) - 2), end=0, step=-1)
# also the python function range excludes the end, resulting in:
# range(start=(len(mpre) - 2), end=-1, step=-1)
for i in range(len(mpre)-2, -1, -1):
mpre[i] = max(mpre[i], mpre[i+1])
"""
This part creates a list of indexes where the recall changes
matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
"""
i_list = []
for i in range(1, len(mrec)):
if mrec[i] != mrec[i-1]:
i_list.append(i) # if it was matlab would be i + 1
"""
The Average Precision (AP) is the area under the curve
(numerical integration)
matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
ap = 0.0
for i in i_list:
ap += ((mrec[i]-mrec[i-1])*mpre[i])
return ap, mrec, mpre
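# Worked example (illustrative, not part of the original script): with
# rec = [0.5, 1.0] and prec = [1.0, 0.5], voc_ap builds mrec = [0, 0.5, 1.0, 1.0]
# and the monotone envelope mpre = [1.0, 1.0, 0.5, 0.0], giving
# ap = (0.5 - 0.0)*1.0 + (1.0 - 0.5)*0.5 = 0.75.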
"""
Convert the lines of a file to a list
"""
def file_lines_to_list(path):
# open txt file lines to a list
with open(path) as f:
content = f.readlines()
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
return content
"""
Draws text in image
"""
def draw_text_in_image(img, text, pos, color, line_width):
font = cv2.FONT_HERSHEY_PLAIN
fontScale = 1
lineType = 1
bottomLeftCornerOfText = pos
cv2.putText(img, text,
bottomLeftCornerOfText,
font,
fontScale,
color,
lineType)
text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0]
return img, (line_width + text_width)
"""
Plot - adjust axes
"""
def adjust_axes(r, t, fig, axes):
# get text width for re-scaling
bb = t.get_window_extent(renderer=r)
text_width_inches = bb.width / fig.dpi
# get axis width in inches
current_fig_width = fig.get_figwidth()
new_fig_width = current_fig_width + text_width_inches
proportion = new_fig_width / current_fig_width
# get axis limit
x_lim = axes.get_xlim()
axes.set_xlim([x_lim[0], x_lim[1]*proportion])
"""
Draw plot using Matplotlib
"""
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
# sort the dictionary by decreasing value, into a list of tuples
sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
# unpacking the list of tuples into two lists
sorted_keys, sorted_values = zip(*sorted_dic_by_value)
#
if true_p_bar != "":
"""
Special case to draw in:
- green -> TP: True Positives (object detected and matches ground-truth)
- red -> FP: False Positives (object detected but does not match ground-truth)
- pink -> FN: False Negatives (object not detected but present in the ground-truth)
"""
fp_sorted = []
tp_sorted = []
for key in sorted_keys:
fp_sorted.append(dictionary[key] - true_p_bar[key])
tp_sorted.append(true_p_bar[key])
plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
# add legend
plt.legend(loc='lower right')
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
fp_val = fp_sorted[i]
tp_val = tp_sorted[i]
fp_str_val = " " + str(fp_val)
tp_str_val = fp_str_val + " " + str(tp_val)
# trick to paint multicolor with offset:
# first paint everything and then repaint the first number
t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
if i == (len(sorted_values)-1): # largest bar
adjust_axes(r, t, fig, axes)
else:
plt.barh(range(n_classes), sorted_values, color=plot_color)
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
str_val = " " + str(val) # add a space before
if val < 1.0:
str_val = " {0:.2f}".format(val)
t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
# re-set axes to show number inside the figure
if i == (len(sorted_values)-1): # largest bar
adjust_axes(r, t, fig, axes)
# set window title
fig.canvas.manager.set_window_title(window_title)
# write classes in y axis
tick_font_size = 12
plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
"""
Re-scale height accordingly
"""
init_height = fig.get_figheight()
# compute the matrix height in points and inches
dpi = fig.dpi
height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
height_in = height_pt / dpi
# compute the required figure height
top_margin = 0.15 # in percentage of the figure height
bottom_margin = 0.05 # in percentage of the figure height
figure_height = height_in / (1 - top_margin - bottom_margin)
# set new height
if figure_height > init_height:
fig.set_figheight(figure_height)
# set plot title
plt.title(plot_title, fontsize=14)
# set axis titles
# plt.xlabel('classes')
plt.xlabel(x_label, fontsize='large')
# adjust size of window
fig.tight_layout()
# save the plot
fig.savefig(output_path)
# show image
if to_show:
plt.show()
# close the plot
plt.close()
"""
Create a ".temp_files/" and "output/" directory
"""
TEMP_FILES_PATH = os.path.join(args.path, ".temp_files")
if not os.path.exists(TEMP_FILES_PATH): # if it doesn't exist already
os.makedirs(TEMP_FILES_PATH)
output_files_path = os.path.join(args.path, "mAP_output")
if os.path.exists(output_files_path): # if it exist already
# reset the output directory
shutil.rmtree(output_files_path)
os.makedirs(output_files_path)
if draw_plot:
os.makedirs(os.path.join(output_files_path, "classes"))
if show_animation:
os.makedirs(os.path.join(output_files_path, "images", "detections_one_by_one"))
"""
ground-truth
Load each of the ground-truth files into a temporary ".json" file.
Create a list of all the class names present in the ground-truth (gt_classes).
"""
# get a list with the ground-truth files
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if len(ground_truth_files_list) == 0:
error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
# dictionary with counter per class
gt_counter_per_class = {}
counter_images_per_class = {}
gt_files = []
for txt_file in ground_truth_files_list:
#print(txt_file)
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
# check if there is a correspondent detection-results file
temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
error(error_msg)
lines_list = file_lines_to_list(txt_file)
# create ground-truth dictionary
bounding_boxes = []
is_difficult = False
already_seen_classes = []
for line in lines_list:
try:
if "difficult" in line:
class_name, left, top, right, bottom, _difficult = line.split()
is_difficult = True
else:
class_name, left, top, right, bottom = line.split()
except ValueError:
error_msg = "Error: File " + txt_file + " in the wrong format.\n"
error_msg += " Expected: <class_name> <left> <top> <right> <bottom> ['difficult']\n"
error_msg += " Received: " + line
error_msg += "\n\nIf you have a <class_name> with spaces between words you should remove them\n"
error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder."
error(error_msg)
# check if class is in the ignore list, if yes skip
if class_name in args.ignore:
continue
bbox = left + " " + top + " " + right + " " +bottom
if is_difficult:
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True})
is_difficult = False
else:
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False})
# count that object
if class_name in gt_counter_per_class:
gt_counter_per_class[class_name] += 1
else:
# if class didn't exist yet
gt_counter_per_class[class_name] = 1
if class_name not in already_seen_classes:
if class_name in counter_images_per_class:
counter_images_per_class[class_name] += 1
else:
# if class didn't exist yet
counter_images_per_class[class_name] = 1
already_seen_classes.append(class_name)
# dump bounding_boxes into a ".json" file
new_temp_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
gt_files.append(new_temp_file)
with open(new_temp_file, 'w') as outfile:
json.dump(bounding_boxes, outfile)
gt_classes = list(gt_counter_per_class.keys())
# let's sort the classes alphabetically
gt_classes = sorted(gt_classes)
n_classes = len(gt_classes)
#print(gt_classes)
#print(gt_counter_per_class)
"""
Check format of the flag --set-class-iou (if used)
e.g. check if class exists
"""
if specific_iou_flagged:
n_args = len(args.set_class_iou)
error_msg = \
'\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]'
if n_args % 2 != 0:
error('Error, missing arguments. Flag usage:' + error_msg)
# [class_1] [IoU_1] [class_2] [IoU_2]
# specific_iou_classes = ['class_1', 'class_2']
specific_iou_classes = args.set_class_iou[::2] # even
# iou_list = ['IoU_1', 'IoU_2']
iou_list = args.set_class_iou[1::2] # odd
if len(specific_iou_classes) != len(iou_list):
error('Error, missing arguments. Flag usage:' + error_msg)
for tmp_class in specific_iou_classes:
if tmp_class not in gt_classes:
error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg)
for num in iou_list:
if not is_float_between_0_and_1(num):
error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg)
"""
detection-results
Load each of the detection-results files into a temporary ".json" file.
"""
# get a list with the detection-results files
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()
for class_index, class_name in enumerate(gt_classes):
bounding_boxes = []
for txt_file in dr_files_list:
#print(txt_file)
# the first time it checks if all the corresponding ground-truth files exist
file_id = txt_file.split(".txt",1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
if class_index == 0:
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
error(error_msg)
lines = file_lines_to_list(txt_file)
for line in lines:
try:
tmp_class_name, confidence, left, top, right, bottom = line.split()
except ValueError:
error_msg = "Error: File " + txt_file + " in the wrong format.\n"
error_msg += " Expected: <class_name> <confidence> <left> <top> <right> <bottom>\n"
error_msg += " Received: " + line
error(error_msg)
if tmp_class_name == class_name:
#print("match")
bbox = left + " " + top + " " + right + " " +bottom
bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox})
#print(bounding_boxes)
# sort detection-results by decreasing confidence
bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True)
with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
"""
Calculate the AP for each class
"""
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
# open file to store the output
with open(output_files_path + "/output.txt", 'w') as output_file:
output_file.write("# AP and precision/recall per class\n")
count_true_positives = {}
for class_index, class_name in enumerate(gt_classes):
count_true_positives[class_name] = 0
"""
Load detection-results of that class
"""
dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
dr_data = json.load(open(dr_file))
"""
Assign detection-results to ground-truth objects
"""
nd = len(dr_data)
tp = [0] * nd # creates an array of zeros of size nd
fp = [0] * nd
for idx, detection in enumerate(dr_data):
file_id = detection["file_id"]
if show_animation:
# find ground truth image
ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
#tifCounter = len(glob.glob1(myPath,"*.tif"))
if len(ground_truth_img) == 0:
error("Error. Image not found with id: " + file_id)
elif len(ground_truth_img) > 1:
error("Error. Multiple image with id: " + file_id)
else: # found image
#print(IMG_PATH + "/" + ground_truth_img[0])
# Load image
img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
# load image with draws of multiple detections
img_cumulative_path = output_files_path + "/images/" + ground_truth_img[0]
if os.path.isfile(img_cumulative_path):
img_cumulative = cv2.imread(img_cumulative_path)
else:
img_cumulative = img.copy()
# Add bottom border to image
bottom_border = 60
BLACK = [0, 0, 0]
img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
# assign detection-results to ground truth object if any
# open ground-truth with that file_id
gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
ground_truth_data = json.load(open(gt_file))
ovmax = -1
gt_match = -1
# load detected object bounding-box
bb = [ float(x) for x in detection["bbox"].split() ]
for obj in ground_truth_data:
# look for a class_name match
if obj["class_name"] == class_name:
bbgt = [ float(x) for x in obj["bbox"].split() ]
bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])]
iw = bi[2] - bi[0] + 1
ih = bi[3] - bi[1] + 1
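# note: the "+ 1" follows the PASCAL VOC convention of inclusive integer pixel
# coordinates, e.g. a box covering columns 3..5 has width 5 - 3 + 1 = 3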
if iw > 0 and ih > 0:
# compute overlap (IoU) = area of intersection / area of union
ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
+ 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
ov = iw * ih / ua
if ov > ovmax:
ovmax = ov
gt_match = obj
# assign detection as true positive/don't care/false positive
if show_animation:
status = "NO MATCH FOUND!" # status is only used in the animation
# set minimum overlap
min_overlap = MINOVERLAP
if specific_iou_flagged:
if class_name in specific_iou_classes:
index = specific_iou_classes.index(class_name)
min_overlap = float(iou_list[index])
if ovmax >= min_overlap:
if "difficult" not in gt_match:
if not bool(gt_match["used"]):
# true positive
tp[idx] = 1
gt_match["used"] = True
count_true_positives[class_name] += 1
# update the ".json" file
with open(gt_file, 'w') as f:
f.write(json.dumps(ground_truth_data))
if show_animation:
status = "MATCH!"
else:
# false positive (multiple detection)
fp[idx] = 1
if show_animation:
status = "REPEATED MATCH!"
else:
# false positive
fp[idx] = 1
if ovmax > 0:
status = "INSUFFICIENT OVERLAP"
"""
Draw image to show animation
"""
if show_animation:
height, width = img.shape[:2]
# colors (OpenCV works with BGR)
white = (255,255,255)
light_blue = (255,200,100)
green = (0,255,0)
light_red = (30,30,255)
# 1st line
margin = 10
v_pos = int(height - margin - (bottom_border / 2.0))
text = "Image: " + ground_truth_img[0] + " "
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
if ovmax != -1:
color = light_red
if status == "INSUFFICIENT OVERLAP":
text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
else:
text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
color = green
img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
# 2nd line
v_pos += int(bottom_border / 2.0)
rank_pos = str(idx+1) # rank position (idx starts at 0)
text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
color = light_red
if status == "MATCH!":
color = green
text = "Result: " + status + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
font = cv2.FONT_HERSHEY_SIMPLEX
if ovmax > 0: # if there is intersections between the bounding-boxes
bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ]
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
bb = [int(i) for i in bb]
cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
# show image
cv2.imshow("Animation", img)
cv2.waitKey(20) # show for 20 ms
# save image to output
output_img_path = output_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
cv2.imwrite(output_img_path, img)
# save the image with all the objects drawn to it
cv2.imwrite(img_cumulative_path, img_cumulative)
#print(tp)
# compute precision/recall
cumsum = 0
for idx, val in enumerate(fp):
fp[idx] += cumsum
cumsum += val
cumsum = 0
for idx, val in enumerate(tp):
tp[idx] += cumsum
cumsum += val
#print(tp)
rec = tp[:]
for idx, val in enumerate(tp):
rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]
#print(rec)
prec = tp[:]
for idx, val in enumerate(tp):
prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx])
#print(prec)
ap, mrec, mprec = voc_ap(rec[:], prec[:])
sum_AP += ap
text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100)
"""
Write to output.txt
"""
rounded_prec = [ '%.2f' % elem for elem in prec ]
rounded_rec = [ '%.2f' % elem for elem in rec ]
output_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
if not args.quiet:
print(text)
ap_dictionary[class_name] = ap
n_images = counter_images_per_class[class_name]
lamr, mr, fppi = log_average_miss_rate(np.array(prec), np.array(rec), n_images)
lamr_dictionary[class_name] = lamr
"""
Draw plot
"""
if draw_plot:
plt.plot(rec, prec, '-o')
# add a new penultimate point to the list (mrec[-2], 0.0)
# since the last line segment (and respective area) do not affect the AP value
area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
# set window title
fig = plt.gcf() # gcf - get current figure
fig.canvas.manager.set_window_title('AP ' + class_name)
# set plot title
plt.title('class: ' + text)
#plt.suptitle('This is a somewhat long figure title', fontsize=16)
# set axis titles
plt.xlabel('Recall')
plt.ylabel('Precision')
# optional - set axes
axes = plt.gca() # gca - get current axes
axes.set_xlim([0.0,1.0])
axes.set_ylim([0.0,1.05]) # .05 to give some extra space
# Alternative option -> wait for button to be pressed
#while not plt.waitforbuttonpress(): pass # wait for key display
# Alternative option -> normal display
#plt.show()
# save the plot
fig.savefig(output_files_path + "/classes/" + class_name + ".png")
plt.cla() # clear axes for next plot
if show_animation:
cv2.destroyAllWindows()
output_file.write("\n# mAP of all classes\n")
mAP = sum_AP / n_classes
text = "mAP = {0:.2f}%".format(mAP*100)
output_file.write(text + "\n")
print(text)
"""
Draw false negatives
"""
if show_animation:
pink = (203,192,255)
for tmp_file in gt_files:
ground_truth_data = json.load(open(tmp_file))
#print(ground_truth_data)
# get name of corresponding image
start = TEMP_FILES_PATH + '/'
img_id = tmp_file[tmp_file.find(start)+len(start):tmp_file.rfind('_ground_truth.json')]
img_cumulative_path = output_files_path + "/images/" + img_id + ".jpg"
img = cv2.imread(img_cumulative_path)
if img is None:
img_path = IMG_PATH + '/' + img_id + ".jpg"
img = cv2.imread(img_path)
# draw false negatives
for obj in ground_truth_data:
if not obj['used']:
bbgt = [ int(round(float(x))) for x in obj["bbox"].split() ]
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),pink,2)
cv2.imwrite(img_cumulative_path, img)
# remove the temp_files directory
shutil.rmtree(TEMP_FILES_PATH)
"""
Count total of detection-results
"""
# iterate through all the files
det_counter_per_class = {}
for txt_file in dr_files_list:
# get lines to list
lines_list = file_lines_to_list(txt_file)
for line in lines_list:
class_name = line.split()[0]
# check if class is in the ignore list, if yes skip
if class_name in args.ignore:
continue
# count that object
if class_name in det_counter_per_class:
det_counter_per_class[class_name] += 1
else:
# if class didn't exist yet
det_counter_per_class[class_name] = 1
#print(det_counter_per_class)
dr_classes = list(det_counter_per_class.keys())
"""
Plot the total number of occurences of each class in the ground-truth
"""
if draw_plot:
window_title = "ground-truth-info"
plot_title = "ground-truth\n"
plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
x_label = "Number of objects per class"
output_path = output_files_path + "/ground-truth-info.png"
to_show = False
plot_color = 'forestgreen'
draw_plot_func(
gt_counter_per_class,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
'',
)
"""
Write number of ground-truth objects per class to results.txt
"""
with open(output_files_path + "/output.txt", 'a') as output_file:
output_file.write("\n# Number of ground-truth objects per class\n")
for class_name in sorted(gt_counter_per_class):
output_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
"""
Finish counting true positives
"""
for class_name in dr_classes:
# if class exists in detection-result but not in ground-truth then there are no true positives in that class
if class_name not in gt_classes:
count_true_positives[class_name] = 0
#print(count_true_positives)
"""
Plot the total number of occurences of each class in the "detection-results" folder
"""
if draw_plot:
window_title = "detection-results-info"
# Plot title
plot_title = "detection-results\n"
plot_title += "(" + str(len(dr_files_list)) + " files and "
count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
# end Plot title
x_label = "Number of objects per class"
output_path = output_files_path + "/detection-results-info.png"
to_show = False
plot_color = 'forestgreen'
true_p_bar = count_true_positives
draw_plot_func(
det_counter_per_class,
len(det_counter_per_class),
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
true_p_bar
)
"""
Write number of detected objects per class to output.txt
"""
with open(output_files_path + "/output.txt", 'a') as output_file:
output_file.write("\n# Number of detected objects per class\n")
for class_name in sorted(dr_classes):
n_det = det_counter_per_class[class_name]
text = class_name + ": " + str(n_det)
text += " (tp:" + str(count_true_positives[class_name]) + ""
text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
output_file.write(text)
"""
Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
"""
if draw_plot:
window_title = "lamr"
plot_title = "log-average miss rate"
x_label = "log-average miss rate"
output_path = output_files_path + "/lamr.png"
to_show = False
plot_color = 'royalblue'
draw_plot_func(
lamr_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
"""
Draw mAP plot (Show AP's of all classes in decreasing order)
"""
if draw_plot:
window_title = "mAP"
plot_title = "mAP = {0:.2f}%".format(mAP*100)
x_label = "Average Precision"
output_path = output_files_path + "/mAP.png"
to_show = True
plot_color = 'royalblue'
draw_plot_func(
ap_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)

dataset.py Normal file (+96 lines)

@@ -0,0 +1,96 @@
import os
import json
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
from functools import partial
class_to_idx = {
'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4,
'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 'diningtable':10,
'dog':11, 'horse':12, 'motorbike':13, 'person':14, 'pottedplant':15,
'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19
}
idx_to_class = {i:c for c, i in class_to_idx.items()}
def get_pascal_voc2007_data(image_root, split='train'):
"""
Use torchvision.datasets
https://pytorch.org/docs/stable/torchvision/datasets.html#torchvision.datasets.VOCDetection
"""
dataset = datasets.VOCDetection(image_root, year='2007', image_set=split,
download=False)
return dataset
def pascal_voc2007_loader(dataset, batch_size, num_workers=0, shuffle=False, proposal_path=None):
"""
Data loader for Pascal VOC 2007.
https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
"""
collate_fn = partial(voc_collate_fn, proposal_path=proposal_path)
train_loader = DataLoader(dataset,
batch_size=batch_size,
shuffle=shuffle, pin_memory=True,
num_workers=num_workers,
collate_fn=collate_fn)
return train_loader
def voc_collate_fn(batch_lst, reshape_size=224, proposal_path=None):
preprocess = transforms.Compose([
transforms.Resize((reshape_size, reshape_size)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
batch_size = len(batch_lst)
img_batch = torch.zeros(batch_size, 3, reshape_size, reshape_size)
box_list = []
box_batch_idx = []
w_list = []
h_list = []
img_id_list = []
proposal_list = []
proposal_batch_idx = []
for i in range(batch_size):
img, ann = batch_lst[i]
w_list.append(img.size[0]) # image width
h_list.append(img.size[1]) # image height
img_id_list.append(ann['annotation']['filename'])
img_batch[i] = preprocess(img)
all_bbox = ann['annotation']['object']
if type(all_bbox) == dict: # a single object is parsed as a dict rather than a list
all_bbox = [all_bbox]
for bbox_idx, one_bbox in enumerate(all_bbox):
bbox = one_bbox['bndbox']
obj_cls = one_bbox['name']
box_list.append(torch.Tensor([float(bbox['xmin']), float(bbox['ymin']),
float(bbox['xmax']), float(bbox['ymax']), class_to_idx[obj_cls]]))
box_batch_idx.append(i)
if proposal_path is not None:
proposal_fn = ann['annotation']['filename'].replace('.jpg', '.json')
with open(os.path.join(proposal_path, proposal_fn), 'r') as f:
proposal = json.load(f)
for p in proposal:
proposal_list.append([p['x_min'], p['y_min'], p['x_max'], p['y_max']])
proposal_batch_idx.append(i)
h_batch = torch.tensor(h_list)
w_batch = torch.tensor(w_list)
box_batch = torch.stack(box_list)
box_batch_ids = torch.tensor(box_batch_idx, dtype=torch.long)
proposals = torch.tensor(proposal_list, dtype=box_batch.dtype)
proposal_batch_ids = torch.tensor(proposal_batch_idx, dtype=torch.long)
assert len(box_batch) == len(box_batch_ids)
assert len(proposals) == len(proposal_batch_ids)
return img_batch, box_batch, box_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_id_list
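# Example usage (illustrative; assumes VOC2007 and the proposal files already
# exist under ./data, using the same paths as main.py):
#   dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'train')
#   loader = pascal_voc2007_loader(
#       dataset, batch_size=16, shuffle=True,
#       proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
#   imgs, boxes, box_ids, proposals, proposal_ids, w, h, img_ids = next(iter(loader))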

loss.py Normal file (+33 lines)

@@ -0,0 +1,33 @@
import torch
import torch.nn.functional as F
def ClsScoreRegression(cls_scores, GT_label, batch_size):
"""
Multi-class cross-entropy loss
Inputs:
- cls_scores: Predicted class scores, of shape (M, C).
- GT_label: GT class labels, of shape (M,).
- batch_size: Batch size used to normalize the loss.
Outputs:
- cls_score_loss: Torch scalar
"""
cls_loss = F.cross_entropy(cls_scores, GT_label, \
reduction='sum') * 1. / batch_size
return cls_loss
def BboxRegression(offsets, GT_offsets, batch_size):
""""
Use SmoothL1 loss as in Faster R-CNN
Inputs:
- offsets: Predicted box offsets, of shape (M, 4)
- GT_offsets: GT box offsets, of shape (M, 4)
- batch_size: Batch size used to normalize the loss
Outputs:
- bbox_reg_loss: Torch scalar
"""
bbox_reg_loss = F.smooth_l1_loss(offsets, GT_offsets, reduction='sum') * 1. / batch_size
return bbox_reg_loss
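# How the two terms are combined during training (see FastRCNN.forward in model.py):
#   total_loss = w_cls * ClsScoreRegression(cls_scores, GT_label, batch_size) \
#              + w_bbox * BboxRegression(offsets, GT_offsets, batch_size)
# with w_cls = w_bbox = 1 by default.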

main.py Normal file (+204 lines)

@@ -0,0 +1,204 @@
import math
import copy
import time
import shutil
import os
import random
os.environ['TORCH_HOME'] = './ckpts'
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import cv2
from dataset import get_pascal_voc2007_data, pascal_voc2007_loader, idx_to_class
from model import FastRCNN
from utils import coord_trans, data_visualizer
def parse_args():
parser = argparse.ArgumentParser('Faster R-CNN', add_help=False)
parser.add_argument('--lr', default=1e-3, type=float)
parser.add_argument('--lr_decay', default=1.0, type=float)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--epochs', default=200, type=int)
parser.add_argument('--num_workers', default=4, type=int)
parser.add_argument('--overfit_small_data', default=False, action='store_true')
parser.add_argument('--output_dir', default='./exp/fast_rcnn')
args = parser.parse_args()
return args
def main(args):
torch.manual_seed(0)
torch.cuda.manual_seed(0)
random.seed(0)
if args.overfit_small_data:
args.output_dir = args.output_dir + "_overfit_small"
os.makedirs(args.output_dir, exist_ok=True)
# build dataset & dataloader
train_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'train')
val_dataset = get_pascal_voc2007_data('./data/VOCtrainval_06-Nov-2007/', 'val')
train_loader = pascal_voc2007_loader(train_dataset, args.batch_size, shuffle=True, num_workers=args.num_workers,
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
val_loader = pascal_voc2007_loader(val_dataset, args.batch_size, num_workers=args.num_workers,
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
if args.overfit_small_data:
num_sample = 10
small_dataset = torch.utils.data.Subset(
train_dataset,
torch.linspace(0, len(train_dataset)-1, steps=num_sample).long()
)
small_train_loader = pascal_voc2007_loader(small_dataset, 10,
proposal_path='data/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Proposals')
val_dataset = small_dataset
train_loader = small_train_loader
val_loader = small_train_loader
model = FastRCNN()
model.cuda()
# build optimizer
optimizer = optim.SGD(
filter(lambda p: p.requires_grad, model.parameters()),
args.lr
)
lr_scheduler = optim.lr_scheduler.LambdaLR(
optimizer,
lambda epoch: args.lr_decay ** epoch
)
# load ckpt
ckpt_path = os.path.join(args.output_dir, 'checkpoint.pth')
start_epoch = 0
if os.path.exists(ckpt_path):
checkpoint = torch.load(ckpt_path)
start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_sched'])
if start_epoch < args.epochs:
train(args, model, train_loader, optimizer, lr_scheduler, start_epoch)
inference(args, model, val_loader, val_dataset, visualize=args.overfit_small_data)
def train(args, model, train_loader, optimizer, lr_scheduler, start_epoch):
loss_history = []
model.train()
for i in range(start_epoch, args.epochs):
start_t = time.time()
for iter_num, data_batch in enumerate(train_loader):
images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, _ = data_batch
resized_boxes = coord_trans(boxes, boxes_batch_ids, w_batch, h_batch, mode='p2a')
resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a')
images = images.to(dtype=torch.float, device='cuda')
resized_boxes = resized_boxes.to(dtype=torch.float, device='cuda')
boxes_batch_ids = boxes_batch_ids.cuda()
resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda')
proposal_batch_ids = proposal_batch_ids.cuda()
loss = model(images, resized_boxes, boxes_batch_ids, resized_proposals, proposal_batch_ids)
optimizer.zero_grad()
loss.backward()
loss_history.append(loss.item())
optimizer.step()
if iter_num % 50 == 0:
print('(Iter {} / {}) loss: {:.4f}'.format(iter_num, len(train_loader), np.mean(loss_history[-50:])))
end_t = time.time()
print('(Epoch {} / {}) loss: {:.4f}, time per epoch: {:.1f}s'.format(
i, args.epochs, np.mean(loss_history[-len(train_loader):]), end_t-start_t))
lr_scheduler.step()
checkpoint = {
'epoch': i + 1,
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_sched': lr_scheduler.state_dict()}
torch.save(checkpoint, os.path.join(args.output_dir, 'checkpoint.pth'))
# plot the training losses
fig, ax = plt.subplots()
ax.plot(loss_history)
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Training loss history')
fig.savefig(os.path.join(args.output_dir, 'training_loss.png'))
plt.close()
def inference(args, model, val_loader, dataset, thresh=0.5, nms_thresh=0.5, visualize=False):
model.eval()
start_t = time.time()
if args.output_dir is not None:
det_dir = os.path.join(args.output_dir, 'mAP_input/detection-results')
gt_dir = os.path.join(args.output_dir, 'mAP_input/ground-truth')
vis_dir = os.path.join(args.output_dir, 'visualize')
os.makedirs(det_dir, exist_ok=True)
os.makedirs(gt_dir, exist_ok=True)
os.makedirs(vis_dir, exist_ok=True)
for iter_num, data_batch in enumerate(val_loader):
images, boxes, boxes_batch_ids, proposals, proposal_batch_ids, w_batch, h_batch, img_ids = data_batch
images = images.to(dtype=torch.float, device='cuda')
resized_proposals = coord_trans(proposals, proposal_batch_ids, w_batch, h_batch, mode='p2a')
resized_proposals = resized_proposals.to(dtype=torch.float, device='cuda')
proposal_batch_ids = proposal_batch_ids.cuda()
with torch.no_grad():
final_proposals, final_conf_scores, final_class = \
model.inference(images, resized_proposals, proposal_batch_ids, thresh=thresh, nms_thresh=nms_thresh)
# clamp on the proposal coordinates
batch_size = len(images)
for idx in range(batch_size):
torch.clamp_(final_proposals[idx][:, 0::2], min=0, max=w_batch[idx])
torch.clamp_(final_proposals[idx][:, 1::2], min=0, max=h_batch[idx])
# visualization
# get the original image
# hack to get the original image so we don't have to load from local again...
i = batch_size*iter_num + idx
img, _ = dataset.__getitem__(i)
box_per_img = boxes[boxes_batch_ids==idx]
final_all = torch.cat((final_proposals[idx], \
final_class[idx].float(), final_conf_scores[idx]), dim=-1).cpu()
final_batch_idx = torch.LongTensor([idx] * final_all.shape[0])
resized_final_proposals = coord_trans(final_all, final_batch_idx, w_batch, h_batch)
# write results to file for evaluation (use mAP API https://github.com/Cartucho/mAP for now...)
if args.output_dir is not None:
file_name = img_ids[idx].replace('.jpg', '.txt')
with open(os.path.join(det_dir, file_name), 'w') as f_det, \
open(os.path.join(gt_dir, file_name), 'w') as f_gt:
print('{}: {} GT bboxes and {} proposals'.format(img_ids[idx], len(box_per_img), resized_final_proposals.shape[0]))
for b in box_per_img:
f_gt.write('{} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[0], b[1], b[2], b[3]))
for b in resized_final_proposals:
f_det.write('{} {:.6f} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[5], b[0], b[1], b[2], b[3]))
if visualize:
data_visualizer(img, idx_to_class, os.path.join(vis_dir, img_ids[idx]), box_per_img, resized_final_proposals)
end_t = time.time()
print('Total inference time: {:.1f}s'.format(end_t-start_t))
if __name__=='__main__':
args = parse_args()
main(args)
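# Typical workflow (illustrative; paths follow the defaults above):
#   python main.py --overfit_small_data           # sanity check on 10 images
#   python main.py --epochs 200 --batch_size 16   # full training + inference
#   python compute_mAP.py --path ./exp/fast_rcnn --no-animation --no-plot
# The last step evaluates the files that inference() wrote under mAP_input/.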

model.py Normal file (+212 lines)

@@ -0,0 +1,212 @@
import math
import torch
import torch.nn as nn
import torchvision
from torchvision import models
from utils import compute_offsets, assign_label, generate_proposal
from loss import ClsScoreRegression, BboxRegression
class FeatureExtractor(nn.Module):
"""
Image feature extraction with MobileNet.
"""
def __init__(self, reshape_size=224, pooling=False, verbose=False):
super().__init__()
self.mobilenet = models.mobilenet_v2(pretrained=True)
self.mobilenet = nn.Sequential(*list(self.mobilenet.children())[:-1]) # Remove the last classifier
# average pooling
if pooling:
self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size/32.))) # input: N x 1280 x 7 x 7
for i in self.mobilenet.named_parameters():
i[1].requires_grad = True # fine-tune all
def forward(self, img, verbose=False):
"""
Inputs:
- img: Batch of resized images, of shape Nx3x224x224
Outputs:
- feat: Image feature, of shape Nx1280 (pooled) or Nx1280x7x7
"""
num_img = img.shape[0]
img_prepro = img
feat = []
process_batch = 500
for b in range(math.ceil(num_img/process_batch)):
feat.append(self.mobilenet(img_prepro[b*process_batch:(b+1)*process_batch]
).squeeze(-1).squeeze(-1)) # forward and squeeze
feat = torch.cat(feat)
if verbose:
print('Output feature shape: ', feat.shape)
return feat
class FastRCNN(nn.Module):
def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \
roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
super().__init__()
assert(num_classes != 0)
self.num_classes = num_classes
self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
self.feat_extractor = FeatureExtractor()
##############################################################################
# TODO: Declare the cls & bbox heads (in Fast R-CNN). #
# The cls & bbox heads share a sequential module with a Linear layer, #
# followed by a Dropout (p=drop_ratio), a ReLU nonlinearity and another #
# Linear layer. #
# The cls head is a Linear layer that predicts num_classes + 1 (background). #
# The det head is a Linear layer that predicts offsets(dim=4). #
# HINT: The dimension of the two Linear layers are in_dim -> hidden_dim and #
# hidden_dim -> hidden_dim. #
##############################################################################
# Replace "pass" statement with your code
pass
##############################################################################
# END OF YOUR CODE #
##############################################################################
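# One possible layout consistent with the description above (a sketch, not the
# reference solution; layer names are placeholders):
#   self.shared_fc = nn.Sequential(
#       nn.Linear(in_dim, hidden_dim),
#       nn.Dropout(drop_ratio),
#       nn.ReLU(),
#       nn.Linear(hidden_dim, hidden_dim))
#   self.cls_head = nn.Linear(hidden_dim, self.num_classes + 1)  # +1 for background
#   self.bbox_head = nn.Linear(hidden_dim, 4)                    # box offsets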
def forward(self, images, bboxes, bbox_batch_ids, proposals, proposal_batch_ids):
"""
Training-time forward pass for our two-stage Faster R-CNN detector.
Inputs:
- images: Tensor of shape (B, 3, H, W) giving input images
- bboxes: Tensor of shape (N, 5) giving ground-truth bounding boxes
and category labels, from the dataloader, where N is the total number
of GT boxes in the batch
- bbox_batch_ids: Tensor of shape (N, ) giving the index (in the batch)
of the image that each GT box belongs to
- proposals: Tensor of shape (M, 4) giving the proposals for input images,
where M is the total number of proposals in the batch
- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
that each proposal belongs to
Outputs:
- total_loss: Torch scalar giving the overall training loss.
"""
w_cls = 1 # for cls_scores
w_bbox = 1 # for offsets
total_loss = None
##############################################################################
# TODO: Implement the forward pass of Fast R-CNN. #
# A few key steps are outlined as follows: #
# i) Extract image feature. #
# ii) Perform RoI Align on proposals, then meanpool the feature in the #
# spatial dimension. #
# iii) Pass the RoI feature through the shared-fc layer. Predict #
# classification scores and box offsets. #
# iv) Assign the proposals with targets of each image. #
# v) Compute the cls_loss between the predicted class_prob and GT_class #
# (For positive & negative proposals) #
# Compute the bbox_loss between the offsets and GT_offsets #
# (For positive proposals) #
# Compute the total_loss which is formulated as: #
# total_loss = w_cls*cls_loss + w_bbox*bbox_loss. #
##############################################################################
# Replace "pass" statement with your code
B, _, H, W = images.shape
# extract image feature
pass
# perform RoI Pool & mean pool
pass
# forward heads, get predicted cls scores & offsets
pass
# assign targets with proposals
pos_masks, neg_masks, GT_labels, GT_bboxes = [], [], [], []
for img_idx in range(B):
# get the positive/negative proposals and corresponding
# GT box & class label of this image
pass
# compute loss
pass
##############################################################################
# END OF YOUR CODE #
##############################################################################
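# A rough sketch of steps i)-iii) above (illustrative only; uses the placeholder
# head names from the __init__ sketch, not the reference solution):
#   feat = self.feat_extractor(images)                          # (B, 1280, 7, 7)
#   rois = torch.cat([proposal_batch_ids.unsqueeze(1).float(), proposals], dim=1)
#   roi_feat = torchvision.ops.roi_align(
#       feat, rois, output_size=(self.roi_output_h, self.roi_output_w))
#   roi_feat = roi_feat.mean(dim=[2, 3])                         # spatial mean pool
#   hidden = self.shared_fc(roi_feat)
#   cls_scores, offsets = self.cls_head(hidden), self.bbox_head(hidden)
# Steps iv)-v) then use assign_label() per image, compute_offsets() on the
# positive proposals, and ClsScoreRegression/BboxRegression from loss.py.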
return total_loss
def inference(self, images, proposals, proposal_batch_ids, thresh=0.5, nms_thresh=0.7):
""""
Inference-time forward pass for our two-stage Faster R-CNN detector
Inputs:
- images: Tensor of shape (B, 3, H, W) giving input images
- proposals: Tensor of shape (M, 4) giving the proposals for input images,
where M is the total number of proposals in the batch
- proposal_batch_ids: Tensor of shape (M, ) giving the index of the image
that each proposal belongs to
- thresh: Threshold value on confidence probability. HINT: You can convert the
classification score to probability using a softmax nonlinearity.
- nms_thresh: IoU threshold for NMS
We can output a variable number of predicted boxes per input image.
In particular we assume that the input images[i] gives rise to P_i final
predicted boxes.
Outputs:
- final_proposals: List of length (B,) where final_proposals[i] is a Tensor
of shape (P_i, 4) giving the coordinates of the final predicted boxes for
the input images[i]
- final_conf_probs: List of length (B,) where final_conf_probs[i] is a
Tensor of shape (P_i, 1) giving the predicted probabilities that the boxes
in final_proposals[i] are objects (vs background)
- final_class: List of length (B,), where final_class[i] is an int64 Tensor
of shape (P_i, 1) giving the predicted category labels for each box in
final_proposals[i].
"""
final_proposals, final_conf_probs, final_class = None, None, None
##############################################################################
# TODO: Predicting the final proposal coordinates `final_proposals`, #
# confidence scores `final_conf_probs`, and the class index `final_class`. #
# The overall steps are similar to the forward pass, but now you cannot #
# decide the activated or negative proposals without GT boxes. #
# You should apply post-processing (thresholding and NMS) to all proposals #
# and keep the final proposals. #
##############################################################################
# Replace "pass" statement with your code
B = images.shape[0]
# extract image feature
pass
# perform RoI Pool & mean pool
pass
# forward heads, get predicted cls scores & offsets
pass
# get predicted boxes & class label & confidence probability
pass
final_proposals = []
final_conf_probs = []
final_class = []
# post-process to get final predictions
for img_idx in range(B):
# filter by threshold
pass
# nms
pass
##############################################################################
# END OF YOUR CODE #
##############################################################################
return final_proposals, final_conf_probs, final_class
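# Post-processing sketch (illustrative, not the reference solution): per image,
#   1) turn cls_scores into probabilities with a softmax and drop proposals whose
#      best non-background probability is below `thresh`,
#   2) decode boxes from the predicted offsets with generate_proposal(),
#   3) run torchvision.ops.nms(boxes, scores, nms_thresh) and keep the surviving
#      boxes, confidence probabilities and class indices.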

utils.py Normal file (+287 lines)

@@ -0,0 +1,287 @@
import numpy as np
import cv2
from matplotlib import pyplot as plt
import torch
def data_visualizer(img, idx_to_class, path, bbox=None, pred=None):
"""
Data visualizer on the original image. Supports both GT box input and proposal input.
Input:
- img: PIL Image input
- idx_to_class: Mapping from the index (0-19) to the class name
- path: Output path where the visualization is saved (also used as the plot title)
- bbox: GT bbox (in red, optional), a tensor of shape Nx5, where N is
the number of GT boxes, 5 indicates (x_tl, y_tl, x_br, y_br, class)
- pred: Predicted bbox (in green, optional), a tensor of shape N'x6, where
N' is the number of predicted boxes, 6 indicates
(x_tl, y_tl, x_br, y_br, class, object confidence score)
"""
img_copy = np.array(img).astype('uint8')
if bbox is not None:
for bbox_idx in range(bbox.shape[0]):
one_bbox = bbox[bbox_idx][:4].numpy().astype('int')
cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2],
one_bbox[3]), (255, 0, 0), 2)
if bbox.shape[1] > 4: # if class info provided
obj_cls = idx_to_class[bbox[bbox_idx][4].item()]
cv2.putText(img_copy, '%s' % (obj_cls),
(one_bbox[0], one_bbox[1]+15),
cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)
if pred is not None:
for bbox_idx in range(pred.shape[0]):
one_bbox = pred[bbox_idx][:4].numpy().astype('int')
cv2.rectangle(img_copy, (one_bbox[0], one_bbox[1]), (one_bbox[2],
one_bbox[3]), (0, 255, 0), 2)
if pred.shape[1] > 4: # if class and conf score info provided
obj_cls = idx_to_class[pred[bbox_idx][4].item()]
conf_score = pred[bbox_idx][5].item()
cv2.putText(img_copy, '%s, %.2f' % (obj_cls, conf_score),
(one_bbox[0], one_bbox[1]+15),
cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)
plt.imshow(img_copy)
plt.axis('off')
plt.title(path)
plt.savefig(path)
plt.close()
def coord_trans(bbox, bbox_batch_idx, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'):
"""
Coordinate transformation function. It converts the box coordinate from
the image coordinate system to the activation map coordinate system and vice versa.
In our case, the input image has a few hundred pixels in
width/height, while the activation map is of size 7x7.
Input:
- bbox: Could be either bbox, anchor, or proposal, of shape Mx4
- bbox_batch_idx: Index of the image that each bbox belongs to, of shape M
- w_pixel: Number of pixels in the width side of the original image, of shape B
- h_pixel: Number of pixels in the height side of the original image, of shape B
- w_amap: Number of pixels in the width side of the activation map, scalar
- h_amap: Number of pixels in the height side of the activation map, scalar
- mode: Whether transfer from the original image to activation map ('p2a') or
the opposite ('a2p')
Output:
- resized_bbox: Resized box coordinates, of the same shape as the input bbox
"""
assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!'
assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1'
if bbox.shape[0] == 0: # corner cases
return bbox
resized_bbox = bbox.clone()
if mode == 'p2a':
# pixel to activation
width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap
height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap
resized_bbox[:, [0, 2]] /= width_ratio.view(-1, 1)
resized_bbox[:, [1, 3]] /= height_ratio.view(-1, 1)
else:
# activation to pixel
width_ratio = w_pixel[bbox_batch_idx] * 1. / w_amap
height_ratio = h_pixel[bbox_batch_idx] * 1. / h_amap
resized_bbox[:, [0, 2]] *= width_ratio.view(-1, 1)
resized_bbox[:, [1, 3]] *= height_ratio.view(-1, 1)
return resized_bbox
def generate_anchor(anc_per_grid, grid):
"""
Anchor generator.
Inputs:
- anc_per_grid: Tensor of shape (A, 2) giving the shapes of anchor boxes to
consider at each point in the grid. anc_per_grid[a] = (w, h) gives the width
and height of the a'th anchor shape.
- grid: Tensor of shape (B, H', W', 2) giving the (x, y) coordinates of the
center of each feature from the backbone feature map. This is the tensor
returned from GenerateGrid.
Outputs:
- anchors: Tensor of shape (B, A, H', W', 4) giving the positions of all
anchor boxes for the entire image. anchors[b, a, h, w] is an anchor box
centered at grid[b, h, w], whose shape is given by anc[a]; we parameterize
boxes as anchors[b, a, h, w] = (x_tl, y_tl, x_br, y_br), where (x_tl, y_tl)
and (x_br, y_br) give the xy coordinates of the top-left and bottom-right
corners of the box.
"""
A, _ = anc_per_grid.shape
B, H, W, _ = grid.shape
anc_per_grid = anc_per_grid.to(grid)
anc_per_grid = anc_per_grid.view(1, A, 1, 1, -1).repeat(B, 1, H, W, 1)
grid = grid.view(B, 1, H, W, -1).repeat(1, A, 1, 1, 1)
x1y1 = grid - anc_per_grid / 2
x2y2 = grid + anc_per_grid / 2
anchors = torch.cat([x1y1, x2y2], dim=-1)
return anchors
def compute_iou(anchors, bboxes):
"""
Compute the intersection-over-union between anchors and gts.
Inputs:
- anchors: Anchor boxes, of shape (M, 4), where M is the number of proposals
- bboxes: GT boxes of shape (N, 4), where N is the number of GT boxes,
4 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt})
Outputs:
- iou: IoU matrix of shape (M, N)
"""
iou = None
##############################################################################
# TODO: Given anchors and gt bboxes, #
# compute the iou between each anchor and gt bbox. #
##############################################################################
pass
##############################################################################
# END OF YOUR CODE #
##############################################################################
return iou
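# A standard vectorized formulation (a sketch; the TODO above may be solved
# differently): broadcast the M proposals against the N GT boxes, e.g.
#   x_tl = torch.max(anchors[:, None, 0], bboxes[None, :, 0])
#   y_tl = torch.max(anchors[:, None, 1], bboxes[None, :, 1])
#   x_br = torch.min(anchors[:, None, 2], bboxes[None, :, 2])
#   y_br = torch.min(anchors[:, None, 3], bboxes[None, :, 3])
#   inter = (x_br - x_tl).clamp(min=0) * (y_br - y_tl).clamp(min=0)
#   area_a = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
#   area_b = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
#   iou = inter / (area_a[:, None] + area_b[None, :] - inter)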
def compute_offsets(anchors, bboxes):
"""
Compute the offsets between anchors and gts.
Inputs:
- anchors: Anchor boxes, of shape (M, 4)
- bboxes: GT boxes of shape (M, 4),
4 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt})
Outputs:
- offsets: offsets of shape (M, 4)
"""
wh_offsets = torch.log((bboxes[:, 2:4] - bboxes[:, :2]) \
/ (anchors[:, 2:4] - anchors[:, :2]))
xy_offsets = (bboxes[:, :2] + bboxes[:, 2:4] - \
anchors[:, :2] - anchors[:, 2:4]) / 2.
xy_offsets /= (anchors[:, 2:4] - anchors[:, :2])
offsets = torch.cat((xy_offsets, wh_offsets), dim=-1)
return offsets
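# Note: generate_proposal() below applies the inverse of this transform, i.e.
# feeding an anchor and the offsets computed here should recover the GT box.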
def generate_proposal(anchors, offsets):
"""
Proposal generator.
Inputs:
- anchors: Anchor boxes, of shape (M, 4). Anchors are represented
by the coordinates of their top-left and bottom-right corners.
- offsets: Transformations of shape (M, 4) that will be used to
convert anchor boxes into region proposals. The transformation
offsets[m] = (tx, ty, tw, th) will be applied to the anchor
anchors[m].
Outputs:
- proposals: Region proposals of shape (M, 4), represented by the
coordinates of their top-left and bottom-right corners. Applying the
transform offsets[m] to the anchor[m] should give the
proposal proposals[m].
"""
proposals = None
##############################################################################
# TODO: Given anchor coordinates and the proposed offset for each anchor, #
# compute the proposal coordinates using the transformation formulas above. #
##############################################################################
# Replace "pass" statement with your code
pass
##############################################################################
# END OF YOUR CODE #
##############################################################################
return proposals
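# Sketch of the inverse transform (mirrors compute_offsets above; one possible
# implementation, not necessarily the reference one):
#   anc_wh = anchors[:, 2:4] - anchors[:, :2]
#   anc_ctr = (anchors[:, :2] + anchors[:, 2:4]) / 2.
#   new_ctr = anc_ctr + offsets[:, :2] * anc_wh
#   new_wh = anc_wh * torch.exp(offsets[:, 2:4])
#   proposals = torch.cat([new_ctr - new_wh / 2., new_ctr + new_wh / 2.], dim=1)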
@torch.no_grad()
def assign_label(proposals, bboxes, background_id, pos_thresh=0.5, neg_thresh=0.5, pos_fraction=0.25):
"""
Determine the activated (positive) and negative proposals for model training.
For Fast R-CNN, positive proposals are defined as either of the two:
(i) the proposal(s) with the highest IoU overlap with a GT box, or
(ii) a proposal that has an IoU overlap higher than the positive threshold with any GT box.
Note: One proposal can match at most one GT box (the one with the largest IoU overlap).
We assign a negative label to a proposal if its IoU ratio is lower than
a threshold value for all GT boxes. Proposals that are neither positive nor negative
do not contribute to the training objective.
Main steps include:
i) Decide activated and negative proposals based on the IoU matrix.
ii) Compute GT confidence score/offsets/object class on the positive proposals.
iii) Compute GT confidence score on the negative proposals.
Inputs:
- proposals: Proposal boxes, of shape (M, 4), where M is the number of proposals
- bboxes: GT boxes of shape Nx5, where N is the number of GT boxes,
5 indicates (x_{tl}^{gt}, y_{tl}^{gt}, x_{br}^{gt}, y_{br}^{gt}) and class index
- background_id: Class id of the background class
- pos_thresh: Positive threshold value
- neg_thresh: Negative threshold value
- pos_fraction: a factor balancing pos/neg proposals
Outputs:
- activated_anc_mask: a binary mask indicating the activated proposals, of shape M
- negative_anc_mask: a binary mask indicating the negative proposals, of shape M
- GT_class: GT class category on all proposals, background class for non-activated proposals,
of shape M
- bboxes: GT bboxes on activated proposals, of shape M'x4, where M' is the number of
activated proposals
"""
M = proposals.shape[0]
N = bboxes.shape[0]
iou_mat = compute_iou(proposals, bboxes[:, :4])
# activated/positive proposals
max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)
max_iou_per_box = iou_mat.max(dim=0, keepdim=True)[0]
activated_anc_mask = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0)
activated_anc_mask |= (iou_mat > pos_thresh) # using the pos_thresh condition as well
activated_anc_mask = activated_anc_mask.max(dim=-1)[0] # (M, )
activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1)
# GT class
box_cls = bboxes[:, 4].long().view(1, N).expand(M, N)
# if a proposal matches multiple GT boxes, choose the box with the largest iou
GT_class = torch.gather(box_cls, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1) # M
GT_class[~activated_anc_mask] = background_id
# GT bboxes
bboxes_expand = bboxes[:, :4].view(1, N, 4).expand((M, N, 4))
bboxes = torch.gather(bboxes_expand, -2, max_iou_per_anc_ind.unsqueeze(-1) \
.unsqueeze(-1).expand(M, 1, 4)).view(M, 4)
bboxes = bboxes[activated_anc_ind]
# negative anchors
negative_anc_mask = (max_iou_per_anc < neg_thresh)
negative_anc_ind = torch.nonzero(negative_anc_mask.view(-1)).squeeze(-1)
# balance pos/neg proposals by random sampling
num_neg = int(activated_anc_ind.shape[0] * (1 - pos_fraction) / pos_fraction)
negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (num_neg,))]
negative_anc_mask = torch.zeros_like(negative_anc_mask)
negative_anc_mask[negative_anc_ind] = 1
return activated_anc_mask, negative_anc_mask, GT_class, bboxes

Binary file not shown.