diff --git a/sh/activate.sh b/sh/activate.sh
new file mode 100644
index 0000000..184775f
--- /dev/null
+++ b/sh/activate.sh
@@ -0,0 +1,6 @@
+export PYTHONPATH=/home/licsber/services/gxs/src
+PY=/home/licsber/anaconda3/envs/gxs-36/bin/python
+
+hostname
+echo $PYTHONPATH
+echo $PY
diff --git a/sh/deploy.sh b/sh/deploy.sh
new file mode 100644
index 0000000..53f18ab
--- /dev/null
+++ b/sh/deploy.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env zsh
+
+SRC=/Users/licsber/Coding/Python/2021工训赛/
+DST=192.168.1.102:/home/licsber/gx/
+rsync -rtvzhP $SRC $DST --delete-after --exclude "venv/" --exclude "__pycache__/" --exclude "*.onnx" --exclude "*.engine" --exclude ".git/"
+
+SRC=/Users/licsber/datasets/工训赛/models/
+
+cd "$SRC" || exit
+rsync -rtvzhP ssd-mobilenet.onnx $DST
+rsync -rtvzhP labels.txt $DST
diff --git a/sh/run.sh b/sh/run.sh
new file mode 100644
index 0000000..e69de29
diff --git a/sh/ser.sh b/sh/ser.sh
new file mode 100644
index 0000000..655e238
--- /dev/null
+++ b/sh/ser.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env zsh
+
+SRC=/Users/licsber/Coding/Python/2021工训赛/
+DST=ser:/home/licsber/services/gxs/
+
+rsync -rtvzhP $SRC $DST --delete-after --exclude "venv/" --exclude "__pycache__/"
+
+SRC=/Users/licsber/datasets/工训赛/models/
+DST=ser:/datasets/工训赛/models
+cd "$SRC" || exit
+
+rsync -tvzhP labels.txt $DST
+rsync -rtvzhP $SRC/../voc/ $DST/../voc --delete-after
+rsync -tvzhP mobilenet-v1-ssd-mp-0_675.pth $DST
+rsync -tvzhP ser:/datasets/工训赛/models/mb1-ssd-Epoch-60-Loss-1.0784624990294962.pth /Users/licsber/datasets/工训赛/models/
+#rsync -tvzhP ssd-mobilenet.onnx $DST
+#rsync -tvzhP mb1-ssd-Epoch-28-Loss-1.1538286421980177.pth $DST
diff --git a/sh/train.sh b/sh/train.sh
new file mode 100644
index 0000000..56acb95
--- /dev/null
+++ b/sh/train.sh
@@ -0,0 +1,3 @@
+source activate.sh
+
+$PY "$PYTHONPATH/train.py" "$@"
diff --git a/src/0_extract_video.py b/src/0_extract_video.py
new file mode 100644
index 0000000..1e9f11f
--- /dev/null
+++ b/src/0_extract_video.py
@@ -0,0 +1,15 @@
+import cv2
+
+from config import VIDEO_PATH, IMG_PATH
+
+count = 0
+for avi in VIDEO_PATH.glob('*.avi'):
+    cap = cv2.VideoCapture(str(avi))
+    while True:
+        suc, bgr = cap.read()
+        if not suc:
+            break
+
+        save_name = IMG_PATH / f"{count}.jpg"
+        cv2.imwrite(str(save_name), bgr)
+        count += 1
diff --git a/src/1_rename_img.py b/src/1_rename_img.py
new file mode 100644
index 0000000..9090973
--- /dev/null
+++ b/src/1_rename_img.py
@@ -0,0 +1,8 @@
+from config import IMG_PATH
+
+count = 0
+all_files = list(IMG_PATH.glob('*.jpg'))
+all_files.sort()
+for img in all_files:
+    img.rename(img.parent / f"{count:06d}.jpg")
+    count += 1
diff --git a/src/2_make_voc.py b/src/2_make_voc.py
new file mode 100644
index 0000000..1d81b09
--- /dev/null
+++ b/src/2_make_voc.py
@@ -0,0 +1,48 @@
+import random
+
+from config import IMG_PATH, VOC_PATH, CLASSES
+
+random.seed(233)
+annos = VOC_PATH / 'Annotations'
+datasets = VOC_PATH / 'ImageSets' / 'Main'
+images = VOC_PATH / 'JPEGImages'
+annos.mkdir(exist_ok=True)
+datasets.mkdir(parents=True, exist_ok=True)
+images.mkdir(exist_ok=True)
+
+for img in IMG_PATH.glob('*.jpg'):
+    img.rename(images / img.name)
+
+for anno in IMG_PATH.glob('*.xml'):
+    anno.rename(annos / anno.name)
+
+labels = VOC_PATH / 'labels.txt'
+labels.write_text('\n'.join(CLASSES))
+
+train_file = datasets / 'train.txt'
+val_file = datasets / 'val.txt'
+train_val_file = datasets / 'trainval.txt'
+test_file = datasets / 'test.txt'
+
+train_ratio = 0.7
+val_ratio = 0.1
+
+total = list(annos.glob('*.xml'))
+random.shuffle(total)
+total_nums =
len(total) + +train_num = int(total_nums * train_ratio) +val_num = int(total_nums * val_ratio) + +train = total[:train_num] +val = total[train_num:train_num + val_num] +test = total[train_num + val_num:] + +train = '\n'.join([i.name.rstrip('.xml') for i in train]) +val = '\n'.join([i.name.rstrip('.xml') for i in val]) +test = '\n'.join([i.name.rstrip('.xml') for i in test]) + +train_file.write_text(train) +val_file.write_text(val) +test_file.write_text(test) +train_val_file.write_text(train + '\n' + val) diff --git a/src/3_train_ssd.py b/src/3_train_ssd.py new file mode 100644 index 0000000..abb22aa --- /dev/null +++ b/src/3_train_ssd.py @@ -0,0 +1,322 @@ +import argparse +import itertools +import logging +import os +import sys + +import torch +from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR +from torch.utils.data import DataLoader, ConcatDataset + +from config import VOC_PATH, MODEL_PATH +from vision.datasets.voc_dataset import VOCDataset +from vision.nn.multibox_loss import MultiboxLoss +from vision.ssd.config import mobilenetv1_ssd_config +from vision.ssd.config import squeezenet_ssd_config +from vision.ssd.config import vgg_ssd_config +from vision.ssd.data_preprocessing import TrainAugmentation, TestTransform +from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite +from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd +from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite +from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite +from vision.ssd.ssd import MatchPrior +from vision.ssd.vgg_ssd import create_vgg_ssd +from vision.utils.misc import str2bool, Timer, freeze_net_layers, store_labels + +parser = argparse.ArgumentParser( + description='Single Shot MultiBox Detector Training With PyTorch') + +parser.add_argument("--dataset-type", default="voc", type=str, + help='Specify dataset type. Currently supports voc and open_images.') +parser.add_argument('--datasets', '--data', nargs='+', default=[str(VOC_PATH)], help='Dataset directory path') +parser.add_argument('--balance-data', action='store_true', + help="Balance training data by down-sampling more frequent labels.") + +parser.add_argument('--net', default="mb1-ssd", + help="The network architecture, it can be mb1-ssd, mb1-lite-ssd, mb2-ssd-lite or vgg16-ssd.") +parser.add_argument('--freeze-base-net', action='store_true', + help="Freeze base net layers.") +parser.add_argument('--freeze-net', action='store_true', + help="Freeze all the layers except the prediction head.") +parser.add_argument('--mb2-width-mult', default=1.0, type=float, + help='Width Multiplifier for MobilenetV2') + +# Params for loading pretrained basenet or checkpoints. 
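+# Illustrative invocation, assuming a VOC-style dataset directory and the default
+# pretrained checkpoint below; paths are examples, adjust to the local setup:
+#   python src/3_train_ssd.py --net mb1-ssd --datasets ~/datasets/工训赛/voc \
+#       --pretrained-ssd mobilenet-v1-ssd-mp-0_675.pth \
+#       --batch-size 16 --num-epochs 100 --scheduler cosine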
+parser.add_argument('--base-net', help='Pretrained base model') +parser.add_argument('--pretrained-ssd', default=str(MODEL_PATH) + '/mobilenet-v1-ssd-mp-0_675.pth', type=str, + help='Pre-trained base model') +parser.add_argument('--resume', default=None, type=str, + help='Checkpoint state_dict file to resume training from') + +# Params for SGD +parser.add_argument('--lr', '--learning-rate', default=0.01, type=float, + help='initial learning rate') +parser.add_argument('--momentum', default=0.9, type=float, + help='Momentum value for optim') +parser.add_argument('--weight-decay', default=5e-4, type=float, + help='Weight decay for SGD') +parser.add_argument('--gamma', default=0.1, type=float, + help='Gamma update for SGD') +parser.add_argument('--base-net-lr', default=0.001, type=float, + help='initial learning rate for base net, or None to use --lr') +parser.add_argument('--extra-layers-lr', default=None, type=float, + help='initial learning rate for the layers not in base net and prediction heads.') + +# Scheduler +parser.add_argument('--scheduler', default="cosine", type=str, + help="Scheduler for SGD. It can one of multi-step and cosine") + +# Params for Multi-step Scheduler +parser.add_argument('--milestones', default="80,100", type=str, + help="milestones for MultiStepLR") + +# Params for Cosine Annealing +parser.add_argument('--t-max', default=100, type=float, + help='T_max value for Cosine Annealing Scheduler.') + +# Train params +parser.add_argument('--batch-size', default=16, type=int, + help='Batch size for training') +parser.add_argument('--num-epochs', '--epochs', default=100, type=int, + help='the number epochs') +parser.add_argument('--num-workers', '--workers', default=0, type=int, + help='Number of workers used in dataloading') +parser.add_argument('--validation-epochs', default=1, type=int, + help='the number epochs between running validation') +parser.add_argument('--debug-steps', default=10, type=int, + help='Set the debug log output frequency.') +parser.add_argument('--use-cuda', default=True, type=str2bool, + help='Use CUDA to train model') +parser.add_argument('--checkpoint-folder', '--model-dir', default=str(MODEL_PATH), + help='Directory for saving checkpoint models') + +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format='%(asctime)s - %(message)s', datefmt="%Y-%m-%d %H:%M:%S") + +args = parser.parse_args() +DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and args.use_cuda else "cpu") + +if args.use_cuda and torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + logging.info("Using CUDA...") + + +def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1): + net.train(True) + running_loss = 0.0 + running_regression_loss = 0.0 + running_classification_loss = 0.0 + for i, data in enumerate(loader): + images, boxes, labels = data + images = images.to(device) + boxes = boxes.to(device) + labels = labels.to(device) + + optimizer.zero_grad() + confidence, locations = net(images) + regression_loss, classification_loss = criterion(confidence, locations, labels, boxes) + loss = regression_loss + classification_loss + loss.backward() + optimizer.step() + + running_loss += loss.item() + running_regression_loss += regression_loss.item() + running_classification_loss += classification_loss.item() + if i and i % debug_steps == 0: + avg_loss = running_loss / debug_steps + avg_reg_loss = running_regression_loss / debug_steps + avg_clf_loss = running_classification_loss / debug_steps + logging.info( + f"Epoch: {epoch}, 
Step: {i}/{len(loader)}, " + + f"Avg Loss: {avg_loss:.4f}, " + + f"Avg Regression Loss {avg_reg_loss:.4f}, " + + f"Avg Classification Loss: {avg_clf_loss:.4f}" + ) + running_loss = 0.0 + running_regression_loss = 0.0 + running_classification_loss = 0.0 + + +def test(loader, net, criterion, device): + net.eval() + running_loss = 0.0 + running_regression_loss = 0.0 + running_classification_loss = 0.0 + num = 0 + for _, data in enumerate(loader): + images, boxes, labels = data + images = images.to(device) + boxes = boxes.to(device) + labels = labels.to(device) + num += 1 + + with torch.no_grad(): + confidence, locations = net(images) + regression_loss, classification_loss = criterion(confidence, locations, labels, boxes) + loss = regression_loss + classification_loss + + running_loss += loss.item() + running_regression_loss += regression_loss.item() + running_classification_loss += classification_loss.item() + return running_loss / num, running_regression_loss / num, running_classification_loss / num + + +if __name__ == '__main__': + timer = Timer() + + logging.info(args) + + if args.checkpoint_folder: + args.checkpoint_folder = os.path.expanduser(args.checkpoint_folder) + + if not os.path.exists(args.checkpoint_folder): + os.mkdir(args.checkpoint_folder) + + if args.net == 'vgg16-ssd': + create_net = create_vgg_ssd + config = vgg_ssd_config + elif args.net == 'mb1-ssd': + create_net = create_mobilenetv1_ssd + config = mobilenetv1_ssd_config + elif args.net == 'mb1-ssd-lite': + create_net = create_mobilenetv1_ssd_lite + config = mobilenetv1_ssd_config + elif args.net == 'sq-ssd-lite': + create_net = create_squeezenet_ssd_lite + config = squeezenet_ssd_config + elif args.net == 'mb2-ssd-lite': + create_net = lambda num: create_mobilenetv2_ssd_lite(num, width_mult=args.mb2_width_mult) + config = mobilenetv1_ssd_config + else: + logging.fatal("The net type is wrong.") + parser.print_help(sys.stderr) + sys.exit(1) + + train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std) + target_transform = MatchPrior(config.priors, config.center_variance, + config.size_variance, 0.5) + + test_transform = TestTransform(config.image_size, config.image_mean, config.image_std) + + logging.info("Prepare training datasets.") + datasets = [] + for dataset_path in args.datasets: + dataset = VOCDataset(dataset_path, transform=train_transform, + target_transform=target_transform) + label_file = os.path.join(args.checkpoint_folder, "labels.txt") + store_labels(label_file, dataset.class_names) + num_classes = len(dataset.class_names) + datasets.append(dataset) + + logging.info(f"Stored labels into file {label_file}.") + train_dataset = ConcatDataset(datasets) + logging.info("Train dataset size: {}".format(len(train_dataset))) + train_loader = DataLoader(train_dataset, args.batch_size, + num_workers=args.num_workers, + shuffle=True) + + logging.info("Prepare Validation datasets.") + val_dataset = VOCDataset(dataset_path, transform=test_transform, + target_transform=target_transform, is_test=True) + logging.info("Validation dataset size: {}".format(len(val_dataset))) + val_loader = DataLoader(val_dataset, args.batch_size, + num_workers=args.num_workers, + shuffle=False) + + logging.info("Build network.") + net = create_net(num_classes) + min_loss = -10000.0 + last_epoch = -1 + + base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr + extra_layers_lr = args.extra_layers_lr if args.extra_layers_lr is not None else args.lr + + if args.freeze_base_net: + 
logging.info("Freeze base net.") + freeze_net_layers(net.base_net) + params = itertools.chain(net.source_layer_add_ons.parameters(), net.extras.parameters(), + net.regression_headers.parameters(), net.classification_headers.parameters()) + params = [ + {'params': itertools.chain( + net.source_layer_add_ons.parameters(), + net.extras.parameters() + ), 'lr': extra_layers_lr}, + {'params': itertools.chain( + net.regression_headers.parameters(), + net.classification_headers.parameters() + )} + ] + elif args.freeze_net: + freeze_net_layers(net.base_net) + freeze_net_layers(net.source_layer_add_ons) + freeze_net_layers(net.extras) + params = itertools.chain(net.regression_headers.parameters(), net.classification_headers.parameters()) + logging.info("Freeze all the layers except prediction heads.") + else: + params = [ + {'params': net.base_net.parameters(), 'lr': base_net_lr}, + {'params': itertools.chain( + net.source_layer_add_ons.parameters(), + net.extras.parameters() + ), 'lr': extra_layers_lr}, + {'params': itertools.chain( + net.regression_headers.parameters(), + net.classification_headers.parameters() + )} + ] + + # load a previous model checkpoint (if requested) + timer.start("Load Model") + if args.resume: + logging.info(f"Resume from the model {args.resume}") + net.load(args.resume) + elif args.base_net: + logging.info(f"Init from base net {args.base_net}") + net.init_from_base_net(args.base_net) + elif args.pretrained_ssd: + logging.info(f"Init from pretrained ssd {args.pretrained_ssd}") + net.init_from_pretrained_ssd(args.pretrained_ssd) + logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.') + + net.to(DEVICE) + + criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3, + center_variance=0.1, size_variance=0.2, device=DEVICE) + optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, + weight_decay=args.weight_decay) + logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " + + f"Extra Layers learning rate: {extra_layers_lr}.") + + if args.scheduler == 'multi-step': + logging.info("Uses MultiStepLR scheduler.") + milestones = [int(v.strip()) for v in args.milestones.split(",")] + scheduler = MultiStepLR(optimizer, milestones=milestones, + gamma=0.1, last_epoch=last_epoch) + elif args.scheduler == 'cosine': + logging.info("Uses CosineAnnealingLR scheduler.") + scheduler = CosineAnnealingLR(optimizer, args.t_max, last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {args.scheduler}.") + parser.print_help(sys.stderr) + sys.exit(1) + + logging.info(f"Start training from epoch {last_epoch + 1}.") + + for epoch in range(last_epoch + 1, args.num_epochs): + train(train_loader, net, criterion, optimizer, + device=DEVICE, debug_steps=args.debug_steps, epoch=epoch) + scheduler.step() + + if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1: + val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE) + logging.info( + f"Epoch: {epoch}, " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Regression Loss {val_regression_loss:.4f}, " + + f"Validation Classification Loss: {val_classification_loss:.4f}" + ) + model_path = os.path.join(args.checkpoint_folder, f"{args.net}-Epoch-{epoch}-Loss-{val_loss}.pth") + net.save(model_path) + logging.info(f"Saved model {model_path}") + + logging.info("Task done, exiting program.") diff --git a/src/4_eval_ssd.py b/src/4_eval_ssd.py new file mode 100644 index 0000000..fd03c0e --- /dev/null +++ 
b/src/4_eval_ssd.py @@ -0,0 +1,219 @@ +import argparse +import logging +import pathlib +import sys + +import numpy as np +import torch + +from config import MODEL_PATH, VOC_PATH, MODEL_NAME +from vision.datasets.open_images import OpenImagesDataset +from vision.datasets.voc_dataset import VOCDataset +from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite, create_mobilenetv2_ssd_lite_predictor +from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd, create_mobilenetv1_ssd_predictor +from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite, create_mobilenetv1_ssd_lite_predictor +from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite, create_squeezenet_ssd_lite_predictor +from vision.ssd.vgg_ssd import create_vgg_ssd, create_vgg_ssd_predictor +from vision.utils import box_utils, measurements +from vision.utils import str2bool, Timer + +parser = argparse.ArgumentParser(description="SSD Evaluation on VOC Dataset.") +parser.add_argument('--net', default='mb1-ssd', + help="The network architecture, it should be of mb1-ssd, mb1-ssd-lite, mb2-ssd-lite or vgg16-ssd.") +parser.add_argument("--trained_model", type=str, + default='/Users/licsber/datasets/工训赛/models/' + MODEL_NAME) + +parser.add_argument("--dataset_type", default="voc", type=str, + help='Specify dataset type. Currently support voc and open_images.') +parser.add_argument("--dataset", type=str, help="The root directory of the VOC dataset or Open Images dataset.", + default=str(VOC_PATH)) +parser.add_argument("--label_file", type=str, help="The label file path.", default=str(MODEL_PATH) + '/labels.txt') +parser.add_argument("--use_cuda", type=str2bool, default=True) +parser.add_argument("--use_2007_metric", type=str2bool, default=True) +parser.add_argument("--nms_method", type=str, default="hard") +parser.add_argument("--iou_threshold", type=float, default=0.5, help="The threshold of Intersection over Union.") +parser.add_argument("--eval_dir", default="eval_results", type=str, help="The directory to store evaluation results.") +parser.add_argument('--mb2_width_mult', default=1.0, type=float, + help='Width Multiplifier for MobilenetV2') +args = parser.parse_args() +DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and args.use_cuda else "cpu") + + +def group_annotation_by_class(dataset): + true_case_stat = {} + all_gt_boxes = {} + all_difficult_cases = {} + for i in range(len(dataset)): + image_id, annotation = dataset.get_annotation(i) + gt_boxes, classes, is_difficult = annotation + gt_boxes = torch.from_numpy(gt_boxes) + for i, difficult in enumerate(is_difficult): + class_index = int(classes[i]) + gt_box = gt_boxes[i] + if not difficult: + true_case_stat[class_index] = true_case_stat.get(class_index, 0) + 1 + + if class_index not in all_gt_boxes: + all_gt_boxes[class_index] = {} + if image_id not in all_gt_boxes[class_index]: + all_gt_boxes[class_index][image_id] = [] + all_gt_boxes[class_index][image_id].append(gt_box) + if class_index not in all_difficult_cases: + all_difficult_cases[class_index] = {} + if image_id not in all_difficult_cases[class_index]: + all_difficult_cases[class_index][image_id] = [] + all_difficult_cases[class_index][image_id].append(difficult) + + for class_index in all_gt_boxes: + for image_id in all_gt_boxes[class_index]: + all_gt_boxes[class_index][image_id] = torch.stack(all_gt_boxes[class_index][image_id]) + for class_index in all_difficult_cases: + for image_id in all_difficult_cases[class_index]: + all_gt_boxes[class_index][image_id] = 
torch.tensor(all_gt_boxes[class_index][image_id]) + return true_case_stat, all_gt_boxes, all_difficult_cases + + +def compute_average_precision_per_class(num_true_cases, gt_boxes, difficult_cases, + prediction_file, iou_threshold, use_2007_metric): + with open(prediction_file) as f: + image_ids = [] + boxes = [] + scores = [] + for line in f: + t = line.rstrip().split("\t") + image_ids.append(t[0]) + scores.append(float(t[1])) + box = torch.tensor([float(v) for v in t[2:]]).unsqueeze(0) + box -= 1.0 # convert to python format where indexes start from 0 + boxes.append(box) + + scores = np.array(scores) + sorted_indexes = np.argsort(-scores) + boxes = [boxes[i] for i in sorted_indexes] + image_ids = [image_ids[i] for i in sorted_indexes] + true_positive = np.zeros(len(image_ids)) + false_positive = np.zeros(len(image_ids)) + matched = set() + for i, image_id in enumerate(image_ids): + box = boxes[i] + if image_id not in gt_boxes: + false_positive[i] = 1 + continue + + gt_box = gt_boxes[image_id] + ious = box_utils.iou_of(box, gt_box) + max_iou = torch.max(ious).item() + max_arg = torch.argmax(ious).item() + if max_iou > iou_threshold: + if difficult_cases[image_id][max_arg] == 0: + if (image_id, max_arg) not in matched: + true_positive[i] = 1 + matched.add((image_id, max_arg)) + else: + false_positive[i] = 1 + else: + false_positive[i] = 1 + + true_positive = true_positive.cumsum() + false_positive = false_positive.cumsum() + precision = true_positive / (true_positive + false_positive) + recall = true_positive / num_true_cases + if use_2007_metric: + return measurements.compute_voc2007_average_precision(precision, recall) + else: + return measurements.compute_average_precision(precision, recall) + + +if __name__ == '__main__': + eval_path = pathlib.Path(args.eval_dir) + eval_path.mkdir(exist_ok=True) + timer = Timer() + class_names = [name.strip() for name in open(args.label_file).readlines()] + + if args.dataset_type == "voc": + dataset = VOCDataset(args.dataset, is_test=True) + elif args.dataset_type == 'open_images': + dataset = OpenImagesDataset(args.dataset, dataset_type="test") + + true_case_stat, all_gb_boxes, all_difficult_cases = group_annotation_by_class(dataset) + if args.net == 'vgg16-ssd': + net = create_vgg_ssd(len(class_names), is_test=True) + elif args.net == 'mb1-ssd': + net = create_mobilenetv1_ssd(len(class_names), is_test=True) + elif args.net == 'mb1-ssd-lite': + net = create_mobilenetv1_ssd_lite(len(class_names), is_test=True) + elif args.net == 'sq-ssd-lite': + net = create_squeezenet_ssd_lite(len(class_names), is_test=True) + elif args.net == 'mb2-ssd-lite': + net = create_mobilenetv2_ssd_lite(len(class_names), width_mult=args.mb2_width_mult, is_test=True) + else: + logging.fatal("The net type is wrong. 
It should be one of vgg16-ssd, mb1-ssd and mb1-ssd-lite.") + parser.print_help(sys.stderr) + sys.exit(1) + + timer.start("Load Model") + net.load(args.trained_model) + net = net.to(DEVICE) + print(f'It took {timer.end("Load Model")} seconds to load the model.') + if args.net == 'vgg16-ssd': + predictor = create_vgg_ssd_predictor(net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb1-ssd': + predictor = create_mobilenetv1_ssd_predictor(net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb1-ssd-lite': + predictor = create_mobilenetv1_ssd_lite_predictor(net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'sq-ssd-lite': + predictor = create_squeezenet_ssd_lite_predictor(net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb2-ssd-lite': + predictor = create_mobilenetv2_ssd_lite_predictor(net, nms_method=args.nms_method, device=DEVICE) + else: + logging.fatal("The net type is wrong. It should be one of vgg16-ssd, mb1-ssd and mb1-ssd-lite.") + parser.print_help(sys.stderr) + sys.exit(1) + + results = [] + for i in range(len(dataset)): + print("process image", i) + timer.start("Load Image") + image = dataset.get_image(i) + print("Load Image: {:4f} seconds.".format(timer.end("Load Image"))) + timer.start("Predict") + boxes, labels, probs = predictor.predict(image) + print("Prediction: {:4f} seconds.".format(timer.end("Predict"))) + indexes = torch.ones(labels.size(0), 1, dtype=torch.float32) * i + results.append(torch.cat([ + indexes.reshape(-1, 1), + labels.reshape(-1, 1).float(), + probs.reshape(-1, 1), + boxes + 1.0 # matlab's indexes start from 1 + ], dim=1)) + results = torch.cat(results) + for class_index, class_name in enumerate(class_names): + if class_index == 0: continue # ignore background + prediction_path = eval_path / f"det_test_{class_name}.txt" + with open(prediction_path, "w") as f: + sub = results[results[:, 1] == class_index, :] + for i in range(sub.size(0)): + prob_box = sub[i, 2:].numpy() + image_id = dataset.ids[int(sub[i, 0])] + print( + image_id + "\t" + " ".join([str(v) for v in prob_box]).replace(" ", "\t"), + file=f + ) + aps = [] + print("\n\nAverage Precision Per-class:") + for class_index, class_name in enumerate(class_names): + if class_index == 0: + continue + prediction_path = eval_path / f"det_test_{class_name}.txt" + ap = compute_average_precision_per_class( + true_case_stat[class_index], + all_gb_boxes[class_index], + all_difficult_cases[class_index], + prediction_path, + args.iou_threshold, + args.use_2007_metric + ) + aps.append(ap) + print(f"{class_name}: {ap}") + + print(f"\nAverage Precision Across All Classes: {sum(aps) / len(aps)}") diff --git a/src/5_video_test.py b/src/5_video_test.py new file mode 100644 index 0000000..da554c8 --- /dev/null +++ b/src/5_video_test.py @@ -0,0 +1,42 @@ +import cv2 + +from config import MODEL_PATH, VIDEO_PATH, LABEL_PATH, MODEL_NAME +from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd, create_mobilenetv1_ssd_predictor + +train_model = MODEL_PATH / MODEL_NAME +test_videos = VIDEO_PATH.glob('*.avi') + +class_names = [name.strip() for name in LABEL_PATH.read_text().split()] +net = create_mobilenetv1_ssd(len(class_names), is_test=True) +net.load(train_model) +predictor = create_mobilenetv1_ssd_predictor(net, nms_method='hard') + +count = 0 +for video in test_videos: + count += 1 + cap = cv2.VideoCapture(str(video)) + if count != 1: + continue + + while True: + suc, bgr = cap.read() + if not suc: + break + + rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB) + 
boxes, labels, probs = predictor.predict(rgb, 5, 0.4) + for i in range(boxes.size(0)): + box = boxes[i, :] + if box[0] <= 0 or box[1] <= 0 or box[3] >= 240: + continue + + label = f"{class_names[labels[i]]}: {probs[i]:.2f}" + print(label) + + cv2.rectangle(bgr, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), (255, 255, 0), 4) + cv2.putText(bgr, label, (int(box[0]) + 20, int(box[1]) + 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 255), 2) + + cv2.imshow('bgr', bgr) + cv2.waitKey(1) + + break diff --git a/src/6_onnx_export.py b/src/6_onnx_export.py new file mode 100644 index 0000000..a502ebf --- /dev/null +++ b/src/6_onnx_export.py @@ -0,0 +1,107 @@ +import argparse +import os +import sys + +import torch.onnx + +from config import MODEL_PATH, MAC, MODEL_NAME +from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite +from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd +from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite +from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite +from vision.ssd.vgg_ssd import create_vgg_ssd + +parser = argparse.ArgumentParser() +parser.add_argument('--net', default='ssd-mobilenet', + help="The network architecture, it can be mb1-ssd (aka ssd-mobilenet), mb1-lite-ssd, mb2-ssd-lite or vgg16-ssd.") +parser.add_argument('--input', type=str, default=str(MODEL_PATH / MODEL_NAME), + help="path to input PyTorch model (.pth checkpoint)") +parser.add_argument('--output', type=str, default='', help="desired path of converted ONNX model (default: .onnx)") +parser.add_argument('--labels', type=str, default=str(MODEL_PATH) + '/labels.txt', help="name of the class labels file") +parser.add_argument('--width', type=int, default=300, help="input width of the model to be exported (in pixels)") +parser.add_argument('--height', type=int, default=300, help="input height of the model to be exported (in pixels)") +parser.add_argument('--batch-size', type=int, default=1, help="batch size of the model to be exported (default=1)") +parser.add_argument('--model-dir', type=str, default=str(MODEL_PATH), + help="directory to look for the input PyTorch model in, and export the converted ONNX model to (if --output doesn't specify a directory)") + +args = parser.parse_args() +print(args) + +# set the device +device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') +print('running on device ' + str(device)) + +# format input model paths +if args.model_dir: + args.model_dir = os.path.expanduser(args.model_dir) + + # find the checkpoint with the lowest loss + if not args.input: + best_loss = 10000 + for file in os.listdir(args.model_dir): + if not file.endswith(".pth"): + continue + try: + loss = float(file[file.rfind("-") + 1:len(file) - 4]) + if loss < best_loss: + best_loss = loss + args.input = os.path.join(args.model_dir, file) + except ValueError: + continue + print('found best checkpoint with loss {:f} ({:s})'.format(best_loss, args.input)) + + # append the model dir (if needed) + if not os.path.isfile(args.input): + args.input = os.path.join(args.model_dir, args.input) + + if not os.path.isfile(args.labels): + args.labels = os.path.join(args.model_dir, args.labels) + +# determine the number of classes +class_names = [name.strip() for name in open(args.labels).readlines()] +num_classes = len(class_names) + +# construct the network architecture +print('creating network: ' + args.net) +print('num classes: ' + str(num_classes)) + +if args.net == 'vgg16-ssd': + net = create_vgg_ssd(len(class_names), is_test=True) 
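+# 'ssd-mobilenet' (the default above) is handled as an alias of 'mb1-ssd' in the
+# next branch. Illustrative export run, assuming the default paths from config.py:
+#   python src/6_onnx_export.py --net ssd-mobilenet --width 300 --height 300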
+elif args.net == 'mb1-ssd' or args.net == 'ssd-mobilenet': + net = create_mobilenetv1_ssd(len(class_names), is_test=True) +elif args.net == 'mb1-ssd-lite': + net = create_mobilenetv1_ssd_lite(len(class_names), is_test=True) +elif args.net == 'mb2-ssd-lite': + net = create_mobilenetv2_ssd_lite(len(class_names), is_test=True) +elif args.net == 'sq-ssd-lite': + net = create_squeezenet_ssd_lite(len(class_names), is_test=True) +else: + print("The net type is wrong. It should be one of vgg16-ssd, mb1-ssd and mb1-ssd-lite.") + sys.exit(1) + +# load the model checkpoint +print('loading checkpoint: ' + args.input) + +net.load(args.input) +net.to(device) +net.eval() + +if MAC: + dummy_input = torch.randn(args.batch_size, 3, args.height, args.width) +else: + dummy_input = torch.randn(args.batch_size, 3, args.height, args.width).cuda() + +# format output model path +if not args.output: + args.output = args.net + '.onnx' + +if args.model_dir and args.output.find('/') == -1 and args.output.find('\\') == -1: + args.output = os.path.join(args.model_dir, args.output) + +input_names = ['input_0'] +output_names = ['scores', 'boxes'] + +print('exporting model to ONNX...') +torch.onnx.export(net, dummy_input, args.output, verbose=True, input_names=input_names, output_names=output_names) +print('model exported to: {:s}'.format(args.output)) +print('task done, exiting program') diff --git a/src/7_onnx_test.py b/src/7_onnx_test.py new file mode 100644 index 0000000..82ad54f --- /dev/null +++ b/src/7_onnx_test.py @@ -0,0 +1,9 @@ +import onnx + +from config import MODEL_PATH + +model_path = MODEL_PATH / 'ssd-mobilenet.onnx' +model = onnx.load(str(model_path)) + +print(onnx.checker.check_model(model)) +print(onnx.helper.printable_graph(model.graph)) diff --git a/src/8_merge_voc.py b/src/8_merge_voc.py new file mode 100644 index 0000000..d84d5fe --- /dev/null +++ b/src/8_merge_voc.py @@ -0,0 +1,12 @@ +from config import IMG_PATH + +all_img = list(IMG_PATH.glob('*.jpg')) +all_img.sort() + +count = 0 +for img in all_img: + xml = IMG_PATH / img.name.replace('jpg', 'xml') + new_file_basename = f"{count:05d}" + xml.rename(xml.parent / (new_file_basename + '.xml')) + img.rename(img.parent / (new_file_basename + '.jpg')) + count += 1 diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..1a4e15b --- /dev/null +++ b/src/config.py @@ -0,0 +1,27 @@ +import sys +from pathlib import Path + +from licsber.dl import DATASETS_ROOT + +CLASSES = ( + 'circle', + 'square', + 'huan', +) + +MODEL_NAME = 'mb1-ssd-Epoch-29-Loss-1.1743878581944633.pth' +MAC = sys.platform == 'darwin' + +VIDEO_PATH = DATASETS_ROOT / '工训赛/video' +VIDEO_PATH = Path(VIDEO_PATH) + +IMG_PATH = VIDEO_PATH.parent / 'labeled' +IMG_PATH.mkdir(exist_ok=True) + +MODEL_PATH = VIDEO_PATH.parent / 'models' +MODEL_PATH.mkdir(exist_ok=True) + +VOC_PATH = VIDEO_PATH.parent / 'voc' +VOC_PATH.mkdir(exist_ok=True) + +LABEL_PATH = MODEL_PATH / 'labels.txt' diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..aed327d --- /dev/null +++ b/src/main.py @@ -0,0 +1,25 @@ +# noinspection PyUnresolvedReferences +import jetson.inference +import jetson.utils + +net = jetson.inference.detectNet(argv=[ + '--model=../ssd-mobilenet.onnx', + '--labels=../labels.txt', + '--input-blob=input_0', + '--output-cvg=scores', + '--output-bbox=boxes', +], + threshold=0.5) + +input = jetson.utils.videoSource('/dev/video0') + +count = 0 +while True: + count += 1 + img = input.Capture() + detections = net.Detect(img, overlay='box,labels,conf') + print("detected 
{:d} objects in image".format(len(detections))) + for detection in detections: + print(detection) + if count >= 100: + break diff --git a/src/vision/__init__.py b/src/vision/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/datasets/__init__.py b/src/vision/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/datasets/collation.py b/src/vision/datasets/collation.py new file mode 100644 index 0000000..da8ae35 --- /dev/null +++ b/src/vision/datasets/collation.py @@ -0,0 +1,31 @@ +import numpy as np +import torch + + +def object_detection_collate(batch): + images = [] + gt_boxes = [] + gt_labels = [] + image_type = type(batch[0][0]) + box_type = type(batch[0][1]) + label_type = type(batch[0][2]) + for image, boxes, labels in batch: + if image_type is np.ndarray: + images.append(torch.from_numpy(image)) + elif image_type is torch.Tensor: + images.append(image) + else: + raise TypeError(f"Image should be tensor or np.ndarray, but got {image_type}.") + if box_type is np.ndarray: + gt_boxes.append(torch.from_numpy(boxes)) + elif box_type is torch.Tensor: + gt_boxes.append(boxes) + else: + raise TypeError(f"Boxes should be tensor or np.ndarray, but got {box_type}.") + if label_type is np.ndarray: + gt_labels.append(torch.from_numpy(labels)) + elif label_type is torch.Tensor: + gt_labels.append(labels) + else: + raise TypeError(f"Labels should be tensor or np.ndarray, but got {label_type}.") + return torch.stack(images), gt_boxes, gt_labels diff --git a/src/vision/datasets/generate_vocdata.py b/src/vision/datasets/generate_vocdata.py new file mode 100644 index 0000000..8e33909 --- /dev/null +++ b/src/vision/datasets/generate_vocdata.py @@ -0,0 +1,128 @@ +import os +import sys +import xml.etree.ElementTree as ET +from random import random + + +def main(filename): + # ratio to divide up the images + train = 0.7 + val = 0.2 + test = 0.1 + if (train + test + val) != 1.0: + print("probabilities must equal 1") + exit() + + # get the labels + labels = [] + imgnames = [] + annotations = {} + + with open(filename, 'r') as labelfile: + label_string = "" + for line in labelfile: + label_string += line.rstrip() + + labels = label_string.split(',') + labels = [elem.replace(" ", "") for elem in labels] + + # get image names + for filename in os.listdir("./JPEGImages"): + if filename.endswith(".jpg"): + img = filename.rstrip('.jpg') + imgnames.append(img) + + print("Labels:", labels, "imgcnt:", len(imgnames)) + + # initialise annotation list + for label in labels: + annotations[label] = [] + + # Scan the annotations for the labels + for img in imgnames: + annote = "Annotations/" + img + '.xml' + if os.path.isfile(annote): + tree = ET.parse(annote) + root = tree.getroot() + annote_labels = [] + for labelname in root.findall('*/name'): + labelname = labelname.text + annote_labels.append(labelname) + if labelname in labels: + annotations[labelname].append(img) + annotations[img] = annote_labels + else: + print("Missing annotation for ", annote) + exit() + + # divvy up the images to the different sets + sampler = imgnames.copy() + train_list = [] + val_list = [] + test_list = [] + + while len(sampler) > 0: + dice = random() + elem = sampler.pop() + + if dice <= test: + test_list.append(elem) + elif dice <= (test + val): + val_list.append(elem) + else: + train_list.append(elem) + + print("Training set:", len(train_list), "validation set:", len(val_list), "test set:", len(test_list)) + + # create the dataset files + create_folder("./ImageSets/Main/") + 
with open("./ImageSets/Main/train.txt", 'w') as outfile: + for name in train_list: + outfile.write(name + "\n") + with open("./ImageSets/Main/val.txt", 'w') as outfile: + for name in val_list: + outfile.write(name + "\n") + with open("./ImageSets/Main/trainval.txt", 'w') as outfile: + for name in train_list: + outfile.write(name + "\n") + for name in val_list: + outfile.write(name + "\n") + + with open("./ImageSets/Main/test.txt", 'w') as outfile: + for name in test_list: + outfile.write(name + "\n") + + # create the individiual files for each label + for label in labels: + with open("./ImageSets/Main/" + label + "_train.txt", 'w') as outfile: + for name in train_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + with open("./ImageSets/Main/" + label + "_val.txt", 'w') as outfile: + for name in val_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + with open("./ImageSets/Main/" + label + "_test.txt", 'w') as outfile: + for name in test_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + + +def create_folder(foldername): + if os.path.exists(foldername): + print('folder already exists:', foldername) + else: + os.makedirs(foldername) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("usage: python generate_vocdata.py ") + exit() + main(sys.argv[1]) diff --git a/src/vision/datasets/open_images.py b/src/vision/datasets/open_images.py new file mode 100644 index 0000000..f573003 --- /dev/null +++ b/src/vision/datasets/open_images.py @@ -0,0 +1,130 @@ +import copy +import logging +import os +import pathlib + +import cv2 +import numpy as np +import pandas as pd + + +class OpenImagesDataset: + + def __init__(self, root, + transform=None, target_transform=None, + dataset_type="train", balance_data=False): + self.root = pathlib.Path(root) + self.transform = transform + self.target_transform = target_transform + self.dataset_type = dataset_type.lower() + + self.data, self.class_names, self.class_dict = self._read_data() + self.balance_data = balance_data + self.min_image_num = -1 + if self.balance_data: + self.data = self._balance_data() + self.ids = [info['image_id'] for info in self.data] + + self.class_stat = None + + def _getitem(self, index): + image_info = self.data[index] + image = self._read_image(image_info['image_id']) + # duplicate boxes to prevent corruption of dataset + boxes = copy.copy(image_info['boxes']) + boxes[:, 0] *= image.shape[1] + boxes[:, 1] *= image.shape[0] + boxes[:, 2] *= image.shape[1] + boxes[:, 3] *= image.shape[0] + # duplicate labels to prevent corruption of dataset + labels = copy.copy(image_info['labels']) + if self.transform: + image, boxes, labels = self.transform(image, boxes, labels) + if self.target_transform: + boxes, labels = self.target_transform(boxes, labels) + return image_info['image_id'], image, boxes, labels + + def __getitem__(self, index): + _, image, boxes, labels = self._getitem(index) + return image, boxes, labels + + def get_annotation(self, index): + """To conform the eval_ssd implementation that is based on the VOC dataset.""" + image_id, image, boxes, labels = self._getitem(index) + is_difficult = np.zeros(boxes.shape[0], dtype=np.uint8) + return image_id, (boxes, labels, is_difficult) + + def get_image(self, index): + image_info = self.data[index] + image = self._read_image(image_info['image_id']) + if self.transform: + image, _ = 
self.transform(image) + return image + + def _read_data(self): + annotation_file = f"{self.root}/sub-{self.dataset_type}-annotations-bbox.csv" + logging.info(f'loading annotations from: {annotation_file}') + annotations = pd.read_csv(annotation_file) + logging.info(f'annotations loaded from: {annotation_file}') + class_names = ['BACKGROUND'] + sorted(list(annotations['ClassName'].unique())) + class_dict = {class_name: i for i, class_name in enumerate(class_names)} + data = [] + for image_id, group in annotations.groupby("ImageID"): + img_path = os.path.join(self.root, self.dataset_type, image_id + '.jpg') + if os.path.isfile(img_path) is False: + logging.error(f'missing ImageID {image_id}.jpg - dropping from annotations') + continue + boxes = group.loc[:, ["XMin", "YMin", "XMax", "YMax"]].values.astype(np.float32) + # make labels 64 bits to satisfy the cross_entropy function + labels = np.array([class_dict[name] for name in group["ClassName"]], dtype='int64') + # print('found image {:s} ({:d})'.format(img_path, len(data))) + data.append({ + 'image_id': image_id, + 'boxes': boxes, + 'labels': labels + }) + print('num images: {:d}'.format(len(data))) + return data, class_names, class_dict + + def __len__(self): + return len(self.data) + + def __repr__(self): + if self.class_stat is None: + self.class_stat = {name: 0 for name in self.class_names[1:]} + for example in self.data: + for class_index in example['labels']: + class_name = self.class_names[class_index] + self.class_stat[class_name] += 1 + content = ["Dataset Summary:" + f"Number of Images: {len(self.data)}", + f"Minimum Number of Images for a Class: {self.min_image_num}", + "Label Distribution:"] + for class_name, num in self.class_stat.items(): + content.append(f"\t{class_name}: {num}") + return "\n".join(content) + + def _read_image(self, image_id): + image_file = self.root / self.dataset_type / f"{image_id}.jpg" + image = cv2.imread(str(image_file)) + if image.shape[2] == 1: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) + else: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + return image + + def _balance_data(self): + logging.info('balancing data') + label_image_indexes = [set() for _ in range(len(self.class_names))] + for i, image in enumerate(self.data): + for label_id in image['labels']: + label_image_indexes[label_id].add(i) + label_stat = [len(s) for s in label_image_indexes] + self.min_image_num = min(label_stat[1:]) + sample_image_indexes = set() + for image_indexes in label_image_indexes[1:]: + image_indexes = np.array(list(image_indexes)) + sub = np.random.permutation(image_indexes)[:self.min_image_num] + sample_image_indexes.update(sub) + sample_data = [self.data[i] for i in sample_image_indexes] + return sample_data diff --git a/src/vision/datasets/voc_dataset.py b/src/vision/datasets/voc_dataset.py new file mode 100644 index 0000000..133e5de --- /dev/null +++ b/src/vision/datasets/voc_dataset.py @@ -0,0 +1,187 @@ +import logging +import os +import pathlib +import xml.etree.ElementTree as ET + +import cv2 +import numpy as np + + +class VOCDataset: + + def __init__(self, root, transform=None, target_transform=None, is_test=False, keep_difficult=True, + label_file=None): + """Dataset for VOC data. + Args: + root: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories: + Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject. 
+ """ + self.root = pathlib.Path(root) + self.transform = transform + self.target_transform = target_transform + + # determine the image set file to use + if is_test: + image_sets_file = self.root / "ImageSets/Main/test.txt" + else: + image_sets_file = self.root / "ImageSets/Main/trainval.txt" + + if not os.path.isfile(image_sets_file): + image_sets_default = self.root / "ImageSets/Main/default.txt" # CVAT only saves default.txt + + if os.path.isfile(image_sets_default): + image_sets_file = image_sets_default + else: + raise IOError("missing ImageSet file {:s}".format(image_sets_file)) + + # read the image set ID's + self.ids = self._read_image_ids(image_sets_file) + self.keep_difficult = keep_difficult + + # if the labels file exists, read in the class names + label_file_name = self.root / "labels.txt" + + if os.path.isfile(label_file_name): + classes = [] + + # classes should be a line-separated list + with open(label_file_name, 'r') as infile: + for line in infile: + classes.append(line.rstrip()) + + # prepend BACKGROUND as first class + classes.insert(0, 'BACKGROUND') + # classes = [ elem.replace(" ", "") for elem in classes] + self.class_names = tuple(classes) + logging.info("VOC Labels read from file: " + str(self.class_names)) + + else: + logging.info("No labels file, using default VOC classes.") + self.class_names = ('BACKGROUND', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)} + + def __getitem__(self, index): + image_id = self.ids[index] + boxes, labels, is_difficult = self._get_annotation(image_id) + + if not self.keep_difficult: + boxes = boxes[is_difficult == 0] + labels = labels[is_difficult == 0] + + # print('__getitem__ image_id=' + str(image_id) + ' \nboxes=' + str(boxes) + ' \nlabels=' + str(labels)) + + image = self._read_image(image_id) + + if self.transform: + image, boxes, labels = self.transform(image, boxes, labels) + if self.target_transform: + boxes, labels = self.target_transform(boxes, labels) + + return image, boxes, labels + + def get_image(self, index): + image_id = self.ids[index] + image = self._read_image(image_id) + if self.transform: + image, _ = self.transform(image) + return image + + def get_annotation(self, index): + image_id = self.ids[index] + return image_id, self._get_annotation(image_id) + + def __len__(self): + return len(self.ids) + + def _read_image_ids(self, image_sets_file): + ids = [] + with open(image_sets_file) as f: + for line in f: + image_id = line.rstrip() + + if len(image_id) <= 0: + print('warning - found empty line in {:s}, skipping line'.format(image_sets_file)) + continue + + if self._get_num_annotations(image_id) > 0: + if self._find_image(image_id) is not None: + ids.append(line.rstrip()) + else: + print('warning - could not find image {:s} - ignoring from dataset'.format(image_id)) + else: + print('warning - image {:s} has no box/labels annotations, ignoring from dataset'.format(image_id)) + + return ids + + def _get_num_annotations(self, image_id): + annotation_file = self.root / f"Annotations/{image_id}.xml" + objects = ET.parse(annotation_file).findall("object") + return len(objects) + + def _get_annotation(self, image_id): + annotation_file = self.root / f"Annotations/{image_id}.xml" + objects = ET.parse(annotation_file).findall("object") + boxes = [] + labels = [] + is_difficult = 
[] + for object in objects: + class_name = object.find('name').text.strip() # .lower().strip() + # we're only concerned with clases in our list + if class_name in self.class_dict: + bbox = object.find('bndbox') + + # VOC dataset format follows Matlab, in which indexes start from 0 + x1 = float(bbox.find('xmin').text) - 1 + y1 = float(bbox.find('ymin').text) - 1 + x2 = float(bbox.find('xmax').text) - 1 + y2 = float(bbox.find('ymax').text) - 1 + boxes.append([x1, y1, x2, y2]) + + labels.append(self.class_dict[class_name]) + + # retrieve element + is_difficult_obj = object.find('difficult') + is_difficult_str = '0' + + if is_difficult_obj is not None: + is_difficult_str = object.find('difficult').text + + is_difficult.append(int(is_difficult_str) if is_difficult_str else 0) + else: + print("warning - image {:s} has object with unknown class '{:s}'".format(image_id, class_name)) + + return (np.array(boxes, dtype=np.float32), + np.array(labels, dtype=np.int64), + np.array(is_difficult, dtype=np.uint8)) + + def _find_image(self, image_id): + img_extensions = ( + '.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.bmp', '.BMP', '.tif', '.TIF', '.tiff', '.TIFF') + + for ext in img_extensions: + image_file = os.path.join(self.root, "JPEGImages/{:s}{:s}".format(image_id, ext)) + + if os.path.exists(image_file): + return image_file + + return None + + def _read_image(self, image_id): + image_file = self._find_image(image_id) + + if image_file is None: + raise IOError('failed to load ' + image_file) + + image = cv2.imread(str(image_file)) + + if image is None or image.size == 0: + raise IOError('failed to load ' + str(image_file)) + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + return image diff --git a/src/vision/nn/__init__.py b/src/vision/nn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/nn/alexnet.py b/src/vision/nn/alexnet.py new file mode 100644 index 0000000..1b2e8dd --- /dev/null +++ b/src/vision/nn/alexnet.py @@ -0,0 +1,60 @@ +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +# copied from torchvision (https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py). +# The forward function is modified for model pruning. + +__all__ = ['AlexNet', 'alexnet'] + +model_urls = { + 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth', +} + + +class AlexNet(nn.Module): + + def __init__(self, num_classes=1000): + super(AlexNet, self).__init__() + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x): + x = self.features(x) + x = x.view(x.size(0), -1) + x = self.classifier(x) + return x + + +def alexnet(pretrained=False, **kwargs): + r"""AlexNet model architecture from the + `"One weird trick..." `_ paper. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = AlexNet(**kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['alexnet'])) + return model diff --git a/src/vision/nn/mobilenet.py b/src/vision/nn/mobilenet.py new file mode 100644 index 0000000..6216696 --- /dev/null +++ b/src/vision/nn/mobilenet.py @@ -0,0 +1,52 @@ +# borrowed from "https://github.com/marvis/pytorch-mobilenet" + +import torch.nn as nn +import torch.nn.functional as F + + +class MobileNetV1(nn.Module): + def __init__(self, num_classes=1024): + super(MobileNetV1, self).__init__() + + def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + + def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True), + ) + + self.model = nn.Sequential( + conv_bn(3, 32, 2), + conv_dw(32, 64, 1), + conv_dw(64, 128, 2), + conv_dw(128, 128, 1), + conv_dw(128, 256, 2), + conv_dw(256, 256, 1), + conv_dw(256, 512, 2), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 1024, 2), + conv_dw(1024, 1024, 1), + ) + self.fc = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.model(x) + x = F.avg_pool2d(x, 7) + x = x.view(-1, 1024) + x = self.fc(x) + return x diff --git a/src/vision/nn/mobilenet_v2.py b/src/vision/nn/mobilenet_v2.py new file mode 100644 index 0000000..f685d50 --- /dev/null +++ b/src/vision/nn/mobilenet_v2.py @@ -0,0 +1,175 @@ +import math + +import torch.nn as nn + + +# Modified from https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/MobileNetV2.py. +# In this version, Relu6 is replaced with Relu to make it ONNX compatible. +# BatchNorm Layer is optional to make it easy do batch norm confusion. 
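+# conv_bn and conv_1x1_bn below build Conv2d -> (optional BatchNorm2d) -> ReLU
+# blocks; use_batch_norm=False is the hook for deploying a model whose BatchNorm
+# parameters have already been folded ("fused") into the convolution weights.
+# Illustrative use, assuming a 3-channel input:
+#   layer = conv_bn(3, 32, stride=2, onnx_compatible=True)  # 3x3 conv + BN + ReLU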
+ + +def conv_bn(inp, oup, stride, use_batch_norm=True, onnx_compatible=False): + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + + if use_batch_norm: + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ReLU(inplace=True) + ) + else: + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + ReLU(inplace=True) + ) + + +def conv_1x1_bn(inp, oup, use_batch_norm=True, onnx_compatible=False): + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + if use_batch_norm: + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ReLU(inplace=True) + ) + else: + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + ReLU(inplace=True) + ) + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio, use_batch_norm=True, onnx_compatible=False): + super(InvertedResidual, self).__init__() + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expand_ratio) + self.use_res_connect = self.stride == 1 and inp == oup + + if expand_ratio == 1: + if use_batch_norm: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + ) + else: + if use_batch_norm: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + ReLU(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__(self, n_class=1000, input_size=224, width_mult=1., dropout_ratio=0.2, + use_batch_norm=True, onnx_compatible=False): + super(MobileNetV2, self).__init__() + block = InvertedResidual + input_channel = 32 + last_channel = 1280 + interverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # building first layer + assert input_size % 32 == 0 + input_channel = int(input_channel * width_mult) + self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel + self.features = [conv_bn(3, input_channel, 2, onnx_compatible=onnx_compatible)] + # building inverted residual blocks + for t, c, n, s in interverted_residual_setting: + output_channel = int(c * width_mult) + for i in range(n): + if i == 0: + self.features.append(block(input_channel, output_channel, s, + expand_ratio=t, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible)) + else: + self.features.append(block(input_channel, 
output_channel, 1, + expand_ratio=t, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible)) + input_channel = output_channel + # building last several layers + self.features.append(conv_1x1_bn(input_channel, self.last_channel, + use_batch_norm=use_batch_norm, onnx_compatible=onnx_compatible)) + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(dropout_ratio), + nn.Linear(self.last_channel, n_class), + ) + + self._initialize_weights() + + def forward(self, x): + x = self.features(x) + x = x.mean(3).mean(2) + x = self.classifier(x) + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/src/vision/nn/multibox_loss.py b/src/vision/nn/multibox_loss.py new file mode 100644 index 0000000..32049e5 --- /dev/null +++ b/src/vision/nn/multibox_loss.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import box_utils + + +class MultiboxLoss(nn.Module): + def __init__(self, priors, iou_threshold, neg_pos_ratio, + center_variance, size_variance, device): + """Implement SSD Multibox Loss. + + Basically, Multibox loss combines classification loss + and Smooth L1 regression loss. + """ + super(MultiboxLoss, self).__init__() + self.iou_threshold = iou_threshold + self.neg_pos_ratio = neg_pos_ratio + self.center_variance = center_variance + self.size_variance = size_variance + self.priors = priors + self.priors.to(device) + + def forward(self, confidence, predicted_locations, labels, gt_locations): + """Compute classification loss and smooth l1 loss. + + Args: + confidence (batch_size, num_priors, num_classes): class predictions. + locations (batch_size, num_priors, 4): predicted locations. + labels (batch_size, num_priors): real labels of all the priors. + boxes (batch_size, num_priors, 4): real boxes corresponding all the priors. 
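+
+        Returns:
+            A (regression_loss, classification_loss) tuple, each summed over
+            the selected priors and divided by the number of positive priors.
+            Hard negative mining keeps at most neg_pos_ratio negatives per
+            positive before the classification loss is computed.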
+ """ + num_classes = confidence.size(2) + with torch.no_grad(): + # derived from cross_entropy=sum(log(p)) + loss = -F.log_softmax(confidence, dim=2)[:, :, 0] + mask = box_utils.hard_negative_mining(loss, labels, self.neg_pos_ratio) + + confidence = confidence[mask, :] + classification_loss = F.cross_entropy(confidence.reshape(-1, num_classes), labels[mask], size_average=False) + pos_mask = labels > 0 + predicted_locations = predicted_locations[pos_mask, :].reshape(-1, 4) + gt_locations = gt_locations[pos_mask, :].reshape(-1, 4) + smooth_l1_loss = F.smooth_l1_loss(predicted_locations, gt_locations, size_average=False) + num_pos = gt_locations.size(0) + return smooth_l1_loss / num_pos, classification_loss / num_pos diff --git a/src/vision/nn/scaled_l2_norm.py b/src/vision/nn/scaled_l2_norm.py new file mode 100644 index 0000000..f31be6a --- /dev/null +++ b/src/vision/nn/scaled_l2_norm.py @@ -0,0 +1,19 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ScaledL2Norm(nn.Module): + def __init__(self, in_channels, initial_scale): + super(ScaledL2Norm, self).__init__() + self.in_channels = in_channels + self.scale = nn.Parameter(torch.Tensor(in_channels)) + self.initial_scale = initial_scale + self.reset_parameters() + + def forward(self, x): + return (F.normalize(x, p=2, dim=1) + * self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3)) + + def reset_parameters(self): + self.scale.data.fill_(self.initial_scale) diff --git a/src/vision/nn/squeezenet.py b/src/vision/nn/squeezenet.py new file mode 100644 index 0000000..a05e39a --- /dev/null +++ b/src/vision/nn/squeezenet.py @@ -0,0 +1,127 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.utils.model_zoo as model_zoo + +__all__ = ['SqueezeNet', 'squeezenet1_0', 'squeezenet1_1'] + +model_urls = { + 'squeezenet1_0': 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth', + 'squeezenet1_1': 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth', +} + + +class Fire(nn.Module): + + def __init__(self, inplanes, squeeze_planes, + expand1x1_planes, expand3x3_planes): + super(Fire, self).__init__() + self.inplanes = inplanes + self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) + self.squeeze_activation = nn.ReLU(inplace=True) + self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, + kernel_size=1) + self.expand1x1_activation = nn.ReLU(inplace=True) + self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, + kernel_size=3, padding=1) + self.expand3x3_activation = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.squeeze_activation(self.squeeze(x)) + return torch.cat([ + self.expand1x1_activation(self.expand1x1(x)), + self.expand3x3_activation(self.expand3x3(x)) + ], 1) + + +class SqueezeNet(nn.Module): + + def __init__(self, version=1.0, num_classes=1000): + super(SqueezeNet, self).__init__() + if version not in [1.0, 1.1]: + raise ValueError("Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version)) + self.num_classes = num_classes + if version == 1.0: + self.features = nn.Sequential( + nn.Conv2d(3, 96, kernel_size=7, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(96, 16, 64, 64), + Fire(128, 16, 64, 64), + Fire(128, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 32, 128, 128), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + 
Fire(512, 64, 256, 256), + ) + else: + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(128, 32, 128, 128), + Fire(256, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + Fire(512, 64, 256, 256), + ) + # Final convolution is initialized differently form the rest + final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) + self.classifier = nn.Sequential( + nn.Dropout(p=0.5), + final_conv, + nn.ReLU(inplace=True), + nn.AvgPool2d(13, stride=1) + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + if m is final_conv: + init.normal_(m.weight, mean=0.0, std=0.01) + else: + init.kaiming_uniform_(m.weight) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x): + x = self.features(x) + x = self.classifier(x) + return x.view(x.size(0), self.num_classes) + + +def squeezenet1_0(pretrained=False, **kwargs): + r"""SqueezeNet model architecture from the `"SqueezeNet: AlexNet-level + accuracy with 50x fewer parameters and <0.5MB model size" + `_ paper. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = SqueezeNet(version=1.0, **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_0'])) + return model + + +def squeezenet1_1(pretrained=False, **kwargs): + r"""SqueezeNet 1.1 model from the `official SqueezeNet repo + `_. + SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters + than SqueezeNet 1.0, without sacrificing accuracy. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = SqueezeNet(version=1.1, **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_1'])) + return model diff --git a/src/vision/nn/vgg.py b/src/vision/nn/vgg.py new file mode 100644 index 0000000..255d8ad --- /dev/null +++ b/src/vision/nn/vgg.py @@ -0,0 +1,25 @@ +import torch.nn as nn + + +# borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py +def vgg(cfg, batch_norm=False): + layers = [] + in_channels = 3 + for v in cfg: + if v == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + elif v == 'C': + layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) + conv7 = nn.Conv2d(1024, 1024, kernel_size=1) + layers += [pool5, conv6, + nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] + return layers diff --git a/src/vision/prunning/__init__.py b/src/vision/prunning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/prunning/prunner.py b/src/vision/prunning/prunner.py new file mode 100644 index 0000000..6e6510a --- /dev/null +++ b/src/vision/prunning/prunner.py @@ -0,0 +1,235 @@ +import logging +from heapq import nsmallest + +import torch +import torch.nn as nn + +from ..utils.model_book import ModelBook + + +class ModelPrunner: + def __init__(self, model, train_fun, ignored_paths=[]): + """ Implement the pruning algorithm described in the 
paper https://arxiv.org/pdf/1611.06440.pdf . + + The prunning criteria is dC/dh * h, while C is the cost, h is the activation. + """ + self.model = model + self.train_fun = train_fun + self.ignored_paths = ignored_paths + self.book = ModelBook(self.model) + self.outputs = {} + self.grads = {} + self.handles = [] + self.decendent_batch_norms = {} # descendants impacted by the conv layers. + self.last_conv_path = None # used to trace the graph + self.descendent_convs = {} # descendants impacted by the conv layers. + self.descendent_linears = {} # descendants impacted by the linear layers. + self.last_linear_path = None # used to trace the graph + + def _make_new_conv(self, conv, filter_index, channel_type="out"): + if not isinstance(conv, nn.Conv2d): + raise TypeError(f"The module is not Conv2d, but {type(conv)}.") + + if channel_type == "out": + new_conv = nn.Conv2d(conv.in_channels, conv.out_channels - 1, conv.kernel_size, conv.stride, + conv.padding, conv.dilation, conv.groups, conv.bias is not None) + mask = torch.ones(conv.out_channels, dtype=torch.uint8) + mask[filter_index] = 0 + new_conv.weight.data = conv.weight.data[mask, :, :, :] + if conv.bias is not None: + new_conv.bias.data = conv.bias.data[mask] + + elif channel_type == 'in': + new_conv = nn.Conv2d(conv.in_channels - 1, conv.out_channels, conv.kernel_size, conv.stride, + conv.padding, conv.dilation, conv.groups, conv.bias is not None) + mask = torch.ones(conv.in_channels, dtype=torch.uint8) + mask[filter_index] = 0 + new_conv.weight.data = conv.weight.data[:, mask, :, :] + if conv.bias is not None: + new_conv.bias.data = conv.bias.data + else: + raise ValueError(f"{channel_type} should be either 'in' or 'out'.") + return new_conv + + def remove_conv_filter(self, path, filter_index): + conv = self.book.get_module(path) + logging.info(f'Prune Conv: {"/".join(path)}, Filter: {filter_index}, Layer: {conv}') + new_conv = self._make_new_conv(conv, filter_index, channel_type="out") + self._update_model(path, new_conv) + + next_conv_path = self.descendent_convs.get(path) + if next_conv_path: + next_conv = self.book.get_module(next_conv_path) + new_next_conv = self._make_new_conv(next_conv, filter_index, channel_type="in") + self._update_model(next_conv_path, new_next_conv) + + # reduce the num_features of batch norm + batch_norm_path = self.decendent_batch_norms.get(path) + if batch_norm_path: + batch_norm = self.book.get_module(batch_norm_path) + new_batch_norm = nn.BatchNorm2d(batch_norm.num_features - 1) + self._update_model(batch_norm_path, new_batch_norm) + + # reduce the in channels of linear layer + linear_path = self.descendent_linears.get(path) + if linear_path: + linear = self.book.get_module(linear_path) + new_linear = self._make_new_linear(linear, filter_index, conv, channel_type="in") + self._update_model(linear_path, new_linear) + + @staticmethod + def _make_new_linear(linear, feature_index, conv=None, channel_type="out"): + if channel_type == "out": + new_linear = nn.Linear(linear.in_features, linear.out_features - 1, + bias=linear.bias is not None) + mask = torch.ones(linear.out_features, dtype=torch.uint8) + mask[feature_index] = 0 + new_linear.weight.data = linear.weight.data[mask, :] + if linear.bias is not None: + new_linear.bias.data = linear.bias.data[mask] + elif channel_type == "in": + if conv: + block = int(linear.in_features / conv.out_channels) + else: + block = 1 + new_linear = nn.Linear(linear.in_features - block, linear.out_features, + bias=linear.bias is not None) + start_index = feature_index * block + 
end_index = (feature_index + 1) * block + mask = torch.ones(linear.in_features, dtype=torch.uint8) + mask[start_index: end_index] = 0 + new_linear.weight.data = linear.weight.data[:, mask] + if linear.bias is not None: + new_linear.bias.data = linear.bias.data + else: + raise ValueError(f"{channel_type} should be either 'in' or 'out'.") + return new_linear + + def prune_conv_layers(self, num=1): + """Prune one conv2d filter. + """ + self.register_conv_hooks() + before_loss, before_accuracy = self.train_fun(self.model) + ranks = [] + for path, output in self.outputs.items(): + output = output.data + grad = self.grads[path].data + v = grad * output + v = v.sum(0).sum(1).sum(1) # sum to the channel axis. + v = torch.abs(v) + v = v / torch.sqrt(torch.sum(v * v)) # normalize + for i, e in enumerate(v): + ranks.append((path, i, e)) + to_prune = nsmallest(num, ranks, key=lambda t: t[2]) + to_prune = sorted(to_prune, key=lambda t: ( + t[0], -t[1])) # prune the filters with bigger indexes first to avoid rearrangement. + for path, filter_index, value in to_prune: + self.remove_conv_filter(path, filter_index) + self.deregister_hooks() + after_loss, after_accuracy = self.train_fun(self.model) + return after_loss - before_loss, after_accuracy - before_accuracy + + def register_conv_hooks(self): + """Run register before training for pruning.""" + self.outputs.clear() + self.grads.clear() + self.handles.clear() + self.last_conv_path = None + self.decendent_batch_norms.clear() + self.descendent_convs.clear() + self.descendent_linears.clear() + + def forward_hook(m, input, output): + path = self.book.get_path(m) + if isinstance(m, nn.Conv2d): + if path not in self.ignored_paths: + self.outputs[path] = output + if self.last_conv_path: + self.descendent_convs[self.last_conv_path] = path + self.last_conv_path = path + elif isinstance(m, nn.BatchNorm2d): + if self.last_conv_path: + self.decendent_batch_norms[self.last_conv_path] = path + elif isinstance(m, nn.Linear): + if self.last_conv_path: + self.descendent_linears[self.last_conv_path] = path + self.last_conv_path = None # after a linear layer the conv layer doesn't matter + + def backward_hook(m, input, output): + path = self.book.get_path(m) + self.grads[path] = output[0] + + for path, m in self.book.modules(module_type=(nn.Conv2d, nn.BatchNorm2d, nn.Linear)): + h = m.register_forward_hook(forward_hook) + self.handles.append(h) + h = m.register_backward_hook(backward_hook) + self.handles.append(h) + + def deregister_hooks(self): + """Run degresiter before retraining to recover the model""" + for handle in self.handles: + handle.remove() + + def prune_linear_layers(self, num=1): + self.register_linear_hooks() + before_loss, before_accuracy = self.train_fun(self.model) + ranks = [] + for path, output in self.outputs.items(): + output = output.data + grad = self.grads[path].data + v = grad * output + v = v.sum(0) # sum to the channel axis. 
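+            # dC/dh * h, summed over the batch above, is a first-order Taylor
+            # estimate of how much the cost changes if this unit is removed;
+            # its magnitude is L2-normalized per layer below so the rankings
+            # are comparable across layers before the smallest are pruned.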
+ v = torch.abs(v) + v = v / torch.sqrt(torch.sum(v * v)) # normalize + for i, e in enumerate(v): + ranks.append((path, i, e)) + to_prune = nsmallest(num, ranks, key=lambda t: t[2]) + to_prune = sorted(to_prune, key=lambda t: (t[0], -t[1])) + for path, feature_index, value in to_prune: + self.remove_linear_feature(path, feature_index) + self.deregister_hooks() + after_loss, after_accuracy = self.train_fun(self.model) + return after_loss - before_loss, after_accuracy - before_accuracy + + def register_linear_hooks(self): + self.outputs.clear() + self.grads.clear() + self.handles.clear() + self.descendent_linears.clear() + self.last_linear_path = None + + def forward_hook(m, input, output): + path = self.book.get_path(m) + if path not in self.ignored_paths: + self.outputs[path] = output + if self.last_linear_path: + self.descendent_linears[self.last_linear_path] = path + self.last_linear_path = path + + def backward_hook(m, input, output): + path = self.book.get_path(m) + self.grads[path] = output[0] + + for _, m in self.book.linear_modules(): + h = m.register_forward_hook(forward_hook) + self.handles.append(h) + h = m.register_backward_hook(backward_hook) + self.handles.append(h) + + def remove_linear_feature(self, path, feature_index): + linear = self.book.get_module(path) + logging.info(f'Prune Linear: {"/".join(path)}, Filter: {feature_index}, Layer: {linear}') + new_linear = self._make_new_linear(linear, feature_index, channel_type="out") + self._update_model(path, new_linear) + + # update following linear layers + next_linear_path = self.descendent_linears.get(path) + if next_linear_path: + next_linear = self.book.get_module(next_linear_path) + new_next_linear = self._make_new_linear(next_linear, feature_index, channel_type='in') + self._update_model(next_linear_path, new_next_linear) + + def _update_model(self, path, module): + parent = self.book.get_module(path[:-1]) + parent._modules[path[-1]] = module + self.book.update(path, module) diff --git a/src/vision/ssd/__init__.py b/src/vision/ssd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/ssd/config/__init__.py b/src/vision/ssd/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/ssd/config/mobilenetv1_ssd_config.py b/src/vision/ssd/config/mobilenetv1_ssd_config.py new file mode 100644 index 0000000..af5137d --- /dev/null +++ b/src/vision/ssd/config/mobilenetv1_ssd_config.py @@ -0,0 +1,32 @@ +import numpy as np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + +image_size = 300 +image_mean = np.array([127, 127, 127]) # RGB layout +image_std = 128.0 +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(19, 16, SSDBoxSizes(60, 105), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]), + SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]), + SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3]) +] + +priors = generate_ssd_priors(specs, image_size) + +# print(' ') +# print('SSD-Mobilenet-v1 priors:') +# print(priors.shape) +# print(priors) +# print(' ') + +# import torch +# torch.save(priors, 'mb1-ssd-priors.pt') + +# np.savetxt('mb1-ssd-priors.txt', priors.numpy()) diff --git a/src/vision/ssd/config/squeezenet_ssd_config.py b/src/vision/ssd/config/squeezenet_ssd_config.py new file mode 100644 index 0000000..531dcd1 --- /dev/null +++ b/src/vision/ssd/config/squeezenet_ssd_config.py @@ -0,0 +1,21 @@ +import numpy as 
np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + +image_size = 300 +image_mean = np.array([127, 127, 127]) # RGB layout +image_std = 128.0 +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(17, 16, SSDBoxSizes(60, 105), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]), + SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]), + SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3]) +] + +priors = generate_ssd_priors(specs, image_size) diff --git a/src/vision/ssd/config/vgg_ssd_config.py b/src/vision/ssd/config/vgg_ssd_config.py new file mode 100644 index 0000000..1358053 --- /dev/null +++ b/src/vision/ssd/config/vgg_ssd_config.py @@ -0,0 +1,22 @@ +import numpy as np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + +image_size = 300 +image_mean = np.array([123, 117, 104]) # RGB layout +image_std = 1.0 + +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) +] + +priors = generate_ssd_priors(specs, image_size) diff --git a/src/vision/ssd/data_preprocessing.py b/src/vision/ssd/data_preprocessing.py new file mode 100644 index 0000000..d9b2fe8 --- /dev/null +++ b/src/vision/ssd/data_preprocessing.py @@ -0,0 +1,62 @@ +from ..transforms.transforms import * + + +class TrainAugmentation: + def __init__(self, size, mean=0, std=1.0): + """ + Args: + size: the size the of final image. + mean: mean pixel value per channel. + """ + self.mean = mean + self.size = size + self.augment = Compose([ + ConvertFromInts(), + PhotometricDistort(), + Expand(self.mean), + RandomSampleCrop(), + RandomMirror(), + ToPercentCoords(), + Resize(self.size), + SubtractMeans(self.mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor(), + ]) + + def __call__(self, img, boxes, labels): + """ + + Args: + img: the output of cv.imread in RGB layout. + boxes: boundding boxes in the form of (x1, y1, x2, y2). + labels: labels of boxes. 
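+
+        Returns:
+            The augmented image as a CHW float tensor, together with the boxes
+            rescaled to percent coordinates and the labels of the boxes that
+            survive the random crop.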
+ """ + return self.augment(img, boxes, labels) + + +class TestTransform: + def __init__(self, size, mean=0.0, std=1.0): + self.transform = Compose([ + ToPercentCoords(), + Resize(size), + SubtractMeans(mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor(), + ]) + + def __call__(self, image, boxes, labels): + return self.transform(image, boxes, labels) + + +class PredictionTransform: + def __init__(self, size, mean=0.0, std=1.0): + self.transform = Compose([ + Resize(size), + SubtractMeans(mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor() + ]) + + def __call__(self, image): + image, _, _ = self.transform(image) + return image diff --git a/src/vision/ssd/fpn_mobilenetv1_ssd.py b/src/vision/ssd/fpn_mobilenetv1_ssd.py new file mode 100644 index 0000000..548f001 --- /dev/null +++ b/src/vision/ssd/fpn_mobilenetv1_ssd.py @@ -0,0 +1,77 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU + +from .config import mobilenetv1_ssd_config as config +from .fpn_ssd import FPNSSD +from .predictor import Predictor +from ..nn.mobilenet import MobileNetV1 + + +def create_fpn_mobilenetv1_ssd(num_classes): + base_net = MobileNetV1(1001).features # disable dropout layer + + source_layer_indexes = [ + (69, Conv2d(in_channels=512, out_channels=256, kernel_size=1)), + (len(base_net), Conv2d(in_channels=1024, out_channels=256, kernel_size=1)), + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? + ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? 
+ ]) + + return FPNSSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers) + + +def create_fpn_mobilenetv1_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, + device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, config.priors, + config.center_variance, config.size_variance, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/ssd/fpn_ssd.py b/src/vision/ssd/fpn_ssd.py new file mode 100644 index 0000000..f301270 --- /dev/null +++ b/src/vision/ssd/fpn_ssd.py @@ -0,0 +1,143 @@ +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn + +from ..utils import box_utils + + +class FPNSSD(nn.Module): + def __init__(self, num_classes: int, base_net: nn.ModuleList, source_layer_indexes: List[int], + extras: nn.ModuleList, classification_headers: nn.ModuleList, + regression_headers: nn.ModuleList, upsample_mode="nearest"): + """Compose a SSD model using the given components. + """ + super(FPNSSD, self).__init__() + + self.num_classes = num_classes + self.base_net = base_net + self.source_layer_indexes = source_layer_indexes + self.extras = extras + self.classification_headers = classification_headers + self.regression_headers = regression_headers + self.upsample_mode = upsample_mode + + # register layers in source_layer_indexes by adding them to a module list + self.source_layer_add_ons = nn.ModuleList([t[1] for t in source_layer_indexes if isinstance(t, tuple)]) + self.upsamplers = [ + nn.Upsample(size=(19, 19), mode='bilinear'), + nn.Upsample(size=(10, 10), mode='bilinear'), + nn.Upsample(size=(5, 5), mode='bilinear'), + nn.Upsample(size=(3, 3), mode='bilinear'), + nn.Upsample(size=(2, 2), mode='bilinear'), + ] + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + confidences = [] + locations = [] + start_layer_index = 0 + header_index = 0 + features = [] + for end_layer_index in self.source_layer_indexes: + + if isinstance(end_layer_index, tuple): + added_layer = end_layer_index[1] + end_layer_index = end_layer_index[0] + else: + added_layer = None + for layer in self.base_net[start_layer_index: end_layer_index]: + x = layer(x) + start_layer_index = end_layer_index + if added_layer: + y = added_layer(x) + else: + y = x + # confidence, location = self.compute_header(header_index, y) + features.append(y) + header_index += 1 + # confidences.append(confidence) + # locations.append(location) + + for layer in self.base_net[end_layer_index:]: + x = layer(x) + + for layer in self.extras: + x = layer(x) + # confidence, location = self.compute_header(header_index, x) + features.append(x) + header_index += 1 + # confidences.append(confidence) + # locations.append(location) + + upstream_feature = None + for i in range(len(features) - 1, -1, -1): + feature = features[i] + if upstream_feature is not None: + upstream_feature = self.upsamplers[i](upstream_feature) + upstream_feature += feature + else: + upstream_feature = feature + confidence, location = self.compute_header(i, upstream_feature) + confidences.append(confidence) + locations.append(location) + confidences = torch.cat(confidences, 1) + locations = torch.cat(locations, 1) + return confidences, locations + + def compute_header(self, i, x): + confidence = self.classification_headers[i](x) + confidence = confidence.permute(0, 2, 3, 1).contiguous() + confidence = 
confidence.view(confidence.size(0), -1, self.num_classes) + + location = self.regression_headers[i](x) + location = location.permute(0, 2, 3, 1).contiguous() + location = location.view(location.size(0), -1, 4) + + return confidence, location + + def init_from_base_net(self, model): + self.base_net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage), strict=False) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init(self): + self.base_net.apply(_xavier_init_) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def load(self, model): + self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage)) + + def save(self, model_path): + torch.save(self.state_dict(), model_path) + + +class MatchPrior(object): + def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold): + self.center_form_priors = center_form_priors + self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors) + self.center_variance = center_variance + self.size_variance = size_variance + self.iou_threshold = iou_threshold + + def __call__(self, gt_boxes, gt_labels): + if type(gt_boxes) is np.ndarray: + gt_boxes = torch.from_numpy(gt_boxes) + if type(gt_labels) is np.ndarray: + gt_labels = torch.from_numpy(gt_labels) + boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels, + self.corner_form_priors, self.iou_threshold) + boxes = box_utils.corner_form_to_center_form(boxes) + locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, + self.size_variance) + return locations, labels + + +def _xavier_init_(m: nn.Module): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight) diff --git a/src/vision/ssd/mobilenet_v2_ssd_lite.py b/src/vision/ssd/mobilenet_v2_ssd_lite.py new file mode 100644 index 0000000..c68246b --- /dev/null +++ b/src/vision/ssd/mobilenet_v2_ssd_lite.py @@ -0,0 +1,71 @@ +import torch +from torch import nn +from torch.nn import Conv2d, Sequential, ModuleList, BatchNorm2d + +from .config import mobilenetv1_ssd_config as config +from .predictor import Predictor +from .ssd import SSD, GraphPath +from ..nn.mobilenet_v2 import MobileNetV2, InvertedResidual + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
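+
+    A depthwise k x k convolution followed by a pointwise 1x1 needs roughly
+    k*k*C_in + C_in*C_out weights instead of k*k*C_in*C_out for a full
+    convolution, which is what keeps these "lite" SSD heads lightweight.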
+ """ + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + BatchNorm2d(in_channels), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_mobilenetv2_ssd_lite(num_classes, width_mult=1.0, use_batch_norm=True, onnx_compatible=False, is_test=False): + base_net = MobileNetV2(width_mult=width_mult, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible).features + + source_layer_indexes = [ + GraphPath(14, 'conv', 3), + 19, + ] + extras = ModuleList([ + InvertedResidual(1280, 512, stride=2, expand_ratio=0.2), + InvertedResidual(512, 256, stride=2, expand_ratio=0.25), + InvertedResidual(256, 256, stride=2, expand_ratio=0.5), + InvertedResidual(256, 64, stride=2, expand_ratio=0.25) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * 4, + kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=1280, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + Conv2d(in_channels=64, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1280, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=64, out_channels=6 * num_classes, kernel_size=1), + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_mobilenetv2_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, + device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/ssd/mobilenetv1_ssd.py b/src/vision/ssd/mobilenetv1_ssd.py new file mode 100644 index 0000000..2d7115b --- /dev/null +++ b/src/vision/ssd/mobilenetv1_ssd.py @@ -0,0 +1,75 @@ +from torch.nn import Conv2d, Sequential, ModuleList, ReLU + +from .config import mobilenetv1_ssd_config as config +from .predictor import Predictor +from .ssd import SSD +from ..nn.mobilenet import MobileNetV1 + + +def create_mobilenetv1_ssd(num_classes, is_test=False): + base_net = MobileNetV1(1001).model # disable dropout layer + + source_layer_indexes = [ + 12, + 14, + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + 
Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? + ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_mobilenetv1_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/ssd/mobilenetv1_ssd_lite.py b/src/vision/ssd/mobilenetv1_ssd_lite.py new file mode 100644 index 0000000..bb18350 --- /dev/null +++ b/src/vision/ssd/mobilenetv1_ssd_lite.py @@ -0,0 +1,80 @@ +from torch.nn import Conv2d, Sequential, ModuleList, ReLU + +from .config import mobilenetv1_ssd_config as config +from .predictor import Predictor +from .ssd import SSD +from ..nn.mobilenet import MobileNetV1 + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
+ """ + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_mobilenetv1_ssd_lite(num_classes, is_test=False): + base_net = MobileNetV1(1001).model # disable dropout layer + + source_layer_indexes = [ + 12, + 14, + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1) + ) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=1), + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_mobilenetv1_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/ssd/predictor.py b/src/vision/ssd/predictor.py new file mode 100644 index 0000000..a37209e --- /dev/null +++ b/src/vision/ssd/predictor.py @@ -0,0 +1,73 @@ +import torch + +from .data_preprocessing import PredictionTransform +from ..utils import box_utils +from ..utils.misc import Timer + + +class Predictor: + def __init__(self, net, size, mean=0.0, std=1.0, nms_method=None, + iou_threshold=0.45, filter_threshold=0.01, candidate_size=200, sigma=0.5, device=None): + self.net = net + self.transform = PredictionTransform(size, mean, std) + self.iou_threshold = iou_threshold + self.filter_threshold = filter_threshold + self.candidate_size = candidate_size + self.nms_method = nms_method + + self.sigma = sigma + if device: + self.device = device + else: + 
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + self.net.to(self.device) + self.net.eval() + + self.timer = Timer() + + def predict(self, image, top_k=-1, prob_threshold=None): + cpu_device = torch.device("cpu") + height, width, _ = image.shape + image = self.transform(image) + # print(image) + images = image.unsqueeze(0) + images = images.to(self.device) + with torch.no_grad(): + self.timer.start() + scores, boxes = self.net.forward(images) + print("Inference time: ", self.timer.end()) + boxes = boxes[0] + scores = scores[0] + if not prob_threshold: + prob_threshold = self.filter_threshold + + boxes = boxes.to(cpu_device) + scores = scores.to(cpu_device) + picked_box_probs = [] + picked_labels = [] + for class_index in range(1, scores.size(1)): + probs = scores[:, class_index] + mask = probs > prob_threshold + probs = probs[mask] + if probs.size(0) == 0: + continue + + subset_boxes = boxes[mask, :] + box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1) + box_probs = box_utils.nms(box_probs, self.nms_method, + score_threshold=prob_threshold, + iou_threshold=self.iou_threshold, + sigma=self.sigma, + top_k=top_k, + candidate_size=self.candidate_size) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.size(0)) + if not picked_box_probs: + return torch.tensor([]), torch.tensor([]), torch.tensor([]) + picked_box_probs = torch.cat(picked_box_probs) + picked_box_probs[:, 0] *= width + picked_box_probs[:, 1] *= height + picked_box_probs[:, 2] *= width + picked_box_probs[:, 3] *= height + return picked_box_probs[:, :4], torch.tensor(picked_labels), picked_box_probs[:, 4] diff --git a/src/vision/ssd/squeezenet_ssd_lite.py b/src/vision/ssd/squeezenet_ssd_lite.py new file mode 100644 index 0000000..2533597 --- /dev/null +++ b/src/vision/ssd/squeezenet_ssd_lite.py @@ -0,0 +1,86 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU + +from .config import squeezenet_ssd_config as config +from .predictor import Predictor +from .ssd import SSD +from ..nn.squeezenet import squeezenet1_1 + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
+ """ + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_squeezenet_ssd_lite(num_classes, is_test=False): + base_net = squeezenet1_1(False).features # disable dropout layer + + source_layer_indexes = [ + 12 + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=512, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=2), + ), + Sequential( + Conv2d(in_channels=512, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1) + ) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=1), + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_squeezenet_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, + device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/ssd/ssd.py b/src/vision/ssd/ssd.py new file mode 100644 index 0000000..bcdbdfd --- /dev/null +++ b/src/vision/ssd/ssd.py @@ -0,0 +1,167 @@ +from collections import namedtuple +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import box_utils + +GraphPath = namedtuple("GraphPath", ['s0', 'name', 's1']) # + + +class SSD(nn.Module): + def __init__(self, num_classes: int, base_net: nn.ModuleList, source_layer_indexes: List[int], + extras: nn.ModuleList, classification_headers: 
nn.ModuleList, + regression_headers: nn.ModuleList, is_test=False, config=None, device=None): + """Compose a SSD model using the given components. + """ + super(SSD, self).__init__() + + self.num_classes = num_classes + self.base_net = base_net + self.source_layer_indexes = source_layer_indexes + self.extras = extras + self.classification_headers = classification_headers + self.regression_headers = regression_headers + self.is_test = is_test + self.config = config + + # register layers in source_layer_indexes by adding them to a module list + self.source_layer_add_ons = nn.ModuleList([t[1] for t in source_layer_indexes + if isinstance(t, tuple) and not isinstance(t, GraphPath)]) + if device: + self.device = device + else: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if is_test: + self.config = config + self.priors = config.priors.to(self.device) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + confidences = [] + locations = [] + start_layer_index = 0 + header_index = 0 + for end_layer_index in self.source_layer_indexes: + if isinstance(end_layer_index, GraphPath): + path = end_layer_index + end_layer_index = end_layer_index.s0 + added_layer = None + elif isinstance(end_layer_index, tuple): + added_layer = end_layer_index[1] + end_layer_index = end_layer_index[0] + path = None + else: + added_layer = None + path = None + for layer in self.base_net[start_layer_index: end_layer_index]: + x = layer(x) + if added_layer: + y = added_layer(x) + else: + y = x + if path: + sub = getattr(self.base_net[end_layer_index], path.name) + for layer in sub[:path.s1]: + x = layer(x) + y = x + for layer in sub[path.s1:]: + x = layer(x) + end_layer_index += 1 + start_layer_index = end_layer_index + confidence, location = self.compute_header(header_index, y) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + for layer in self.base_net[end_layer_index:]: + x = layer(x) + + for layer in self.extras: + x = layer(x) + confidence, location = self.compute_header(header_index, x) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + confidences = torch.cat(confidences, 1) + locations = torch.cat(locations, 1) + + if self.is_test: + confidences = F.softmax(confidences, dim=2) + boxes = box_utils.convert_locations_to_boxes( + locations, self.priors, self.config.center_variance, self.config.size_variance + ) + boxes = box_utils.center_form_to_corner_form(boxes) + return confidences, boxes + else: + return confidences, locations + + def compute_header(self, i, x): + confidence = self.classification_headers[i](x) + confidence = confidence.permute(0, 2, 3, 1).contiguous() + confidence = confidence.view(confidence.size(0), -1, self.num_classes) + + location = self.regression_headers[i](x) + location = location.permute(0, 2, 3, 1).contiguous() + location = location.view(location.size(0), -1, 4) + + return confidence, location + + def init_from_base_net(self, model): + self.base_net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage), strict=True) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init_from_pretrained_ssd(self, model): + state_dict = torch.load(model, map_location=lambda storage, loc: storage) + state_dict = {k: v for k, v in state_dict.items() if + not (k.startswith("classification_headers") or 
k.startswith("regression_headers"))} + model_dict = self.state_dict() + model_dict.update(state_dict) + self.load_state_dict(model_dict) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init(self): + self.base_net.apply(_xavier_init_) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def load(self, model): + self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage)) + + def save(self, model_path): + torch.save(self.state_dict(), model_path) + + +class MatchPrior(object): + def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold): + self.center_form_priors = center_form_priors + self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors) + self.center_variance = center_variance + self.size_variance = size_variance + self.iou_threshold = iou_threshold + + def __call__(self, gt_boxes, gt_labels): + if type(gt_boxes) is np.ndarray: + gt_boxes = torch.from_numpy(gt_boxes) + if type(gt_labels) is np.ndarray: + gt_labels = torch.from_numpy(gt_labels) + boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels, + self.corner_form_priors, self.iou_threshold) + boxes = box_utils.corner_form_to_center_form(boxes) + locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, + self.size_variance) + return locations, labels + + +def _xavier_init_(m: nn.Module): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight) diff --git a/src/vision/ssd/vgg_ssd.py b/src/vision/ssd/vgg_ssd.py new file mode 100644 index 0000000..21ff264 --- /dev/null +++ b/src/vision/ssd/vgg_ssd.py @@ -0,0 +1,76 @@ +from torch.nn import Conv2d, Sequential, ModuleList, ReLU, BatchNorm2d + +from .config import vgg_ssd_config as config +from .predictor import Predictor +from .ssd import SSD +from ..nn.vgg import vgg + + +def create_vgg_ssd(num_classes, is_test=False): + vgg_config = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', + 512, 512, 512] + base_net = ModuleList(vgg(vgg_config)) + + source_layer_indexes = [ + (23, BatchNorm2d(512)), + len(base_net), + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=4 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * 4, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? 
+ ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=4 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * num_classes, kernel_size=3, padding=1), + # TODO: change to kernel_size=1, padding=0? + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_vgg_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/src/vision/test/__init__.py b/src/vision/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/test/assets/000138.jpg b/src/vision/test/assets/000138.jpg new file mode 100644 index 0000000..6e4746e Binary files /dev/null and b/src/vision/test/assets/000138.jpg differ diff --git a/src/vision/test/test_vgg_ssd.py b/src/vision/test/test_vgg_ssd.py new file mode 100644 index 0000000..89468b4 --- /dev/null +++ b/src/vision/test/test_vgg_ssd.py @@ -0,0 +1,49 @@ +import tempfile + +import torch + +from ..ssd.vgg_ssd import create_vgg_ssd + + +def test_create_vgg_ssd(): + for num_classes in [2, 10, 21, 100]: + _ = create_vgg_ssd(num_classes) + + +def test_forward(): + for num_classes in [2]: + net = create_vgg_ssd(num_classes) + net.init() + net.eval() + x = torch.randn(2, 3, 300, 300) + confidences, locations = net.forward(x) + assert confidences.size() == torch.Size([2, 8732, num_classes]) + assert locations.size() == torch.Size([2, 8732, 4]) + assert confidences.nonzero().size(0) != 0 + assert locations.nonzero().size(0) != 0 + + +def test_save_model(): + net = create_vgg_ssd(10) + net.init() + with tempfile.TemporaryFile() as f: + net.save(f) + + +def test_save_load_model_consistency(): + net = create_vgg_ssd(20) + net.init() + model_path = tempfile.NamedTemporaryFile().name + net.save(model_path) + net_copy = create_vgg_ssd(20) + net_copy.load(model_path) + + net.eval() + net_copy.eval() + + for _ in range(1): + x = torch.randn(1, 3, 300, 300) + confidences1, locations1 = net.forward(x) + confidences2, locations2 = net_copy.forward(x) + assert (confidences1 == confidences2).long().sum() == confidences2.numel() + assert (locations1 == locations2).long().sum() == locations2.numel() diff --git a/src/vision/transforms/__init__.py b/src/vision/transforms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/transforms/transforms.py b/src/vision/transforms/transforms.py new file mode 100644 index 0000000..95b5ab4 --- /dev/null +++ b/src/vision/transforms/transforms.py @@ -0,0 +1,410 @@ +# from https://github.com/amdegroot/ssd.pytorch + + +import types + +import cv2 +import numpy as np +import torch +from numpy import random +from torchvision import transforms + + +def intersect(box_a, box_b): + max_xy = np.minimum(box_a[:, 2:], box_b[2:]) + min_xy = np.maximum(box_a[:, :2], box_b[:2]) + inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) + return inter[:, 0] * inter[:, 1] + + +def 
jaccard_numpy(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: Multiple bounding boxes, Shape: [num_boxes,4] + box_b: Single bounding box, Shape: [4] + Return: + jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2] - box_a[:, 0]) * + (box_a[:, 3] - box_a[:, 1])) # [A,B] + area_b = ((box_b[2] - box_b[0]) * + (box_b[3] - box_b[1])) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +class Compose(object): + """Composes several augmentations together. + Args: + transforms (List[Transform]): list of transforms to compose. + Example: + >>> augmentations.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, boxes=None, labels=None): + for t in self.transforms: + img, boxes, labels = t(img, boxes, labels) + return img, boxes, labels + + +class Lambda(object): + """Applies a lambda as a transform.""" + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img, boxes=None, labels=None): + return self.lambd(img, boxes, labels) + + +class ConvertFromInts(object): + def __call__(self, image, boxes=None, labels=None): + return image.astype(np.float32), boxes, labels + + +class SubtractMeans(object): + def __init__(self, mean): + self.mean = np.array(mean, dtype=np.float32) + + def __call__(self, image, boxes=None, labels=None): + image = image.astype(np.float32) + image -= self.mean + return image.astype(np.float32), boxes, labels + + +class ToAbsoluteCoords(object): + def __call__(self, image, boxes=None, labels=None): + height, width, channels = image.shape + boxes[:, 0] *= width + boxes[:, 2] *= width + boxes[:, 1] *= height + boxes[:, 3] *= height + + return image, boxes, labels + + +class ToPercentCoords(object): + def __call__(self, image, boxes=None, labels=None): + height, width, channels = image.shape + boxes[:, 0] /= width + boxes[:, 2] /= width + boxes[:, 1] /= height + boxes[:, 3] /= height + + return image, boxes, labels + + +class Resize(object): + def __init__(self, size=300): + self.size = size + + def __call__(self, image, boxes=None, labels=None): + image = cv2.resize(image, (self.size, + self.size)) + return image, boxes, labels + + +class RandomSaturation(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." 
+ + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + image[:, :, 1] *= random.uniform(self.lower, self.upper) + + return image, boxes, labels + + +class RandomHue(object): + def __init__(self, delta=18.0): + assert delta >= 0.0 and delta <= 360.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + image[:, :, 0] += random.uniform(-self.delta, self.delta) + image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 + image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 + return image, boxes, labels + + +class RandomLightingNoise(object): + def __init__(self): + self.perms = ((0, 1, 2), (0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + swap = self.perms[random.randint(len(self.perms))] + shuffle = SwapChannels(swap) # shuffle channels + image = shuffle(image) + return image, boxes, labels + + +class ConvertColor(object): + def __init__(self, current, transform): + self.transform = transform + self.current = current + + def __call__(self, image, boxes=None, labels=None): + if self.current == 'BGR' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + elif self.current == 'RGB' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) + elif self.current == 'BGR' and self.transform == 'RGB': + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif self.current == 'HSV' and self.transform == 'BGR': + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + elif self.current == 'HSV' and self.transform == "RGB": + image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) + else: + raise NotImplementedError + return image, boxes, labels + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." + + # expects float image + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + alpha = random.uniform(self.lower, self.upper) + image *= alpha + return image, boxes, labels + + +class RandomBrightness(object): + def __init__(self, delta=32): + assert delta >= 0.0 + assert delta <= 255.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + delta = random.uniform(-self.delta, self.delta) + image += delta + return image, boxes, labels + + +class ToCV2Image(object): + def __call__(self, tensor, boxes=None, labels=None): + return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels + + +class ToTensor(object): + def __call__(self, cvimage, boxes=None, labels=None): + return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels + + +class RandomSampleCrop(object): + """Crop + Arguments: + img (Image): the image being input during training + boxes (Tensor): the original bounding boxes in pt form + labels (Tensor): the class labels for each bbox + mode (float tuple): the min and max jaccard overlaps + Return: + (img, boxes, classes) + img (Image): the cropped image + boxes (Tensor): the adjusted bounding boxes in pt form + labels (Tensor): the class labels for each bbox + """ + + def __init__(self): + self.sample_options = ( + # using entire original input image + None, + # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 + (0.1, None), + (0.3, None), + (0.7, None), + (0.9, None), + # randomly sample a patch + (None, None), + ) + + def __call__(self, image, boxes=None, labels=None): + height, width, _ = image.shape + while True: + # randomly choose a mode + # mode = random.choice(self.sample_options) # throws numpy deprecation warning + mode = self.sample_options[random.randint(len(self.sample_options))] + + if mode is None: + return image, boxes, labels + + min_iou, max_iou = mode + if min_iou is None: + min_iou = float('-inf') + if max_iou is None: + max_iou = float('inf') + + # max trials (50) + for _ in range(50): + current_image = image + + w = random.uniform(0.3 * width, width) + h = random.uniform(0.3 * height, height) + + # aspect ratio constraint b/t .5 & 2 + if h / w < 0.5 or h / w > 2: + continue + + left = random.uniform(width - w) + top = random.uniform(height - h) + + # convert to integer rect x1,y1,x2,y2 + rect = np.array([int(left), int(top), int(left + w), int(top + h)]) + + # calculate IoU (jaccard overlap) b/t the cropped and gt boxes + overlap = jaccard_numpy(boxes, rect) + + # are the min and max overlap constraints satisfied? if not, try again + if overlap.min() < min_iou and max_iou < overlap.max(): + continue + + # cut the crop from the image + current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], + :] + + # keep overlap with gt box IF center in sampled patch + centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 + + # mask in all gt boxes whose centers lie below and to the right of the crop's top-left corner + m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) + + # mask in all gt boxes whose centers lie above and to the left of the crop's bottom-right corner + m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) + + # keep boxes where both m1 and m2 are true, i.e. the center falls inside the crop + mask = m1 * m2 + + # have any valid boxes?
try again if not + if not mask.any(): + continue + + # take only matching gt boxes + current_boxes = boxes[mask, :].copy() + + # take only matching gt labels + current_labels = labels[mask] + + # should we use the box left and top corner or the crop's + current_boxes[:, :2] = np.maximum(current_boxes[:, :2], + rect[:2]) + # adjust to crop (by subtracting crop's left,top) + current_boxes[:, :2] -= rect[:2] + + current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], + rect[2:]) + # adjust to crop (by subtracting crop's left,top) + current_boxes[:, 2:] -= rect[:2] + + return current_image, current_boxes, current_labels + + +class Expand(object): + def __init__(self, mean): + self.mean = mean + + def __call__(self, image, boxes, labels): + if random.randint(2): + return image, boxes, labels + + height, width, depth = image.shape + ratio = random.uniform(1, 4) + left = random.uniform(0, width * ratio - width) + top = random.uniform(0, height * ratio - height) + + expand_image = np.zeros( + (int(height * ratio), int(width * ratio), depth), + dtype=image.dtype) + expand_image[:, :, :] = self.mean + expand_image[int(top):int(top + height), + int(left):int(left + width)] = image + image = expand_image + + boxes = boxes.copy() + boxes[:, :2] += (int(left), int(top)) + boxes[:, 2:] += (int(left), int(top)) + + return image, boxes, labels + + +class RandomMirror(object): + def __call__(self, image, boxes, classes): + _, width, _ = image.shape + if random.randint(2): + image = image[:, ::-1] + boxes = boxes.copy() + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes, classes + + +class SwapChannels(object): + """Transforms a tensorized image by swapping the channels in the order + specified in the swap tuple. + Args: + swaps (int triple): final order of channels + eg: (2, 1, 0) + """ + + def __init__(self, swaps): + self.swaps = swaps + + def __call__(self, image): + """ + Args: + image (Tensor): image tensor to be transformed + Return: + a tensor with channels swapped according to swap + """ + # if torch.is_tensor(image): + # image = image.data.cpu().numpy() + # else: + # image = np.array(image) + image = image[:, :, self.swaps] + return image + + +class PhotometricDistort(object): + def __init__(self): + self.pd = [ + RandomContrast(), # RGB + ConvertColor(current="RGB", transform='HSV'), # HSV + RandomSaturation(), # HSV + RandomHue(), # HSV + ConvertColor(current='HSV', transform='RGB'), # RGB + RandomContrast() # RGB + ] + self.rand_brightness = RandomBrightness() + self.rand_light_noise = RandomLightingNoise() + + def __call__(self, image, boxes, labels): + im = image.copy() + im, boxes, labels = self.rand_brightness(im, boxes, labels) + if random.randint(2): + distort = Compose(self.pd[:-1]) + else: + distort = Compose(self.pd[1:]) + im, boxes, labels = distort(im, boxes, labels) + return self.rand_light_noise(im, boxes, labels) diff --git a/src/vision/utils/__init__.py b/src/vision/utils/__init__.py new file mode 100644 index 0000000..0789bdb --- /dev/null +++ b/src/vision/utils/__init__.py @@ -0,0 +1 @@ +from .misc import * diff --git a/src/vision/utils/box_utils.py b/src/vision/utils/box_utils.py new file mode 100644 index 0000000..0f22bac --- /dev/null +++ b/src/vision/utils/box_utils.py @@ -0,0 +1,293 @@ +import collections +import itertools +import math +from typing import List + +import torch + +SSDBoxSizes = collections.namedtuple('SSDBoxSizes', ['min', 'max']) + +SSDSpec = collections.namedtuple('SSDSpec', ['feature_map_size', 'shrinkage', 'box_sizes', 'aspect_ratios']) +
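The two namedtuples above are the whole configuration surface for prior-box generation: each SSDSpec describes one feature map (its grid size, its stride/shrinkage relative to the input, the min/max box sizes, and the extra aspect ratios), and generate_ssd_priors, defined next, expands a list of them into center-form priors. A minimal sketch of that call, using the example spec list quoted in the docstring below (the spec lists actually used at train time live in the vision/ssd/config modules) and assuming a 300-pixel input:

from vision.utils.box_utils import SSDBoxSizes, SSDSpec, generate_ssd_priors

specs = [
    SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]),
    SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]),
    SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]),
]
# Each grid location gets 2 square priors plus 2 per extra aspect ratio,
# so this spec list yields 8732 priors -- the count asserted in test_vgg_ssd.py.
priors = generate_ssd_priors(specs, image_size=300)
assert tuple(priors.shape) == (8732, 4)  # [center_x, center_y, w, h], relative coordinates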
+ +def generate_ssd_priors(specs: List[SSDSpec], image_size, clamp=True) -> torch.Tensor: + """Generate SSD Prior Boxes. + + It returns the center, height and width of the priors. The values are relative to the image size + Args: + specs: SSDSpecs about the shapes of sizes of prior boxes. i.e. + specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) + ] + image_size: image size. + clamp: if true, clamp the values to make fall between [0.0, 1.0] + Returns: + priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values + are relative to the image size. + """ + priors = [] + for spec in specs: + scale = image_size / spec.shrinkage + for j, i in itertools.product(range(spec.feature_map_size), repeat=2): + x_center = (i + 0.5) / scale + y_center = (j + 0.5) / scale + + # small sized square box + size = spec.box_sizes.min + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # big sized square box + size = math.sqrt(spec.box_sizes.max * spec.box_sizes.min) + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # change h/w ratio of the small sized box + size = spec.box_sizes.min + h = w = size / image_size + for ratio in spec.aspect_ratios: + ratio = math.sqrt(ratio) + priors.append([ + x_center, + y_center, + w * ratio, + h / ratio + ]) + priors.append([ + x_center, + y_center, + w / ratio, + h * ratio + ]) + + priors = torch.tensor(priors) + if clamp: + torch.clamp(priors, 0.0, 1.0, out=priors) + return priors + + +def convert_locations_to_boxes(locations, priors, center_variance, + size_variance): + """Convert regressional location results of SSD into boxes in the form of (center_x, center_y, h, w). + + The conversion: + $$predicted\_center * center_variance = \frac {real\_center - prior\_center} {prior\_hw}$$ + $$exp(predicted\_hw * size_variance) = \frac {real\_hw} {prior\_hw}$$ + We do it in the inverse direction here. + Args: + locations (batch_size, num_priors, 4): the regression output of SSD. It will contain the outputs as well. + priors (num_priors, 4) or (batch_size/1, num_priors, 4): prior boxes. + center_variance: a float used to change the scale of center. + size_variance: a float used to change of scale of size. + Returns: + boxes: priors: [[center_x, center_y, h, w]]. All the values + are relative to the image size. + """ + # priors can have one dimension less. 
+ if priors.dim() + 1 == locations.dim(): + priors = priors.unsqueeze(0) + return torch.cat([ + locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2], + torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:] + ], dim=locations.dim() - 1) + + +def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance): + # priors can have one dimension less + if center_form_priors.dim() + 1 == center_form_boxes.dim(): + center_form_priors = center_form_priors.unsqueeze(0) + return torch.cat([ + (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance, + torch.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance + ], dim=center_form_boxes.dim() - 1) + + +def area_of(left_top, right_bottom) -> torch.Tensor: + """Compute the areas of rectangles given two corners. + + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + + Returns: + area (N): return the area. + """ + hw = torch.clamp(right_bottom - left_top, min=0.0) + return hw[..., 0] * hw[..., 1] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def assign_priors(gt_boxes, gt_labels, corner_form_priors, + iou_threshold): + """Assign ground truth boxes and targets to priors. + + Args: + gt_boxes (num_targets, 4): ground truth boxes. + gt_labels (num_targets): labels of targets. + corner_form_priors (num_priors, 4): corner-form priors. + Returns: + boxes (num_priors, 4): real values for priors. + labels (num_priors): labels for priors. + """ + # size: num_priors x num_targets + ious = iou_of(gt_boxes.unsqueeze(0), corner_form_priors.unsqueeze(1)) + # size: num_priors + best_target_per_prior, best_target_per_prior_index = ious.max(1) + # size: num_targets + best_prior_per_target, best_prior_per_target_index = ious.max(0) + + for target_index, prior_index in enumerate(best_prior_per_target_index): + best_target_per_prior_index[prior_index] = target_index + # 2.0 is used to make sure every target has a prior assigned + best_target_per_prior.index_fill_(0, best_prior_per_target_index, 2) + # size: num_priors + labels = gt_labels[best_target_per_prior_index] + labels[best_target_per_prior < iou_threshold] = 0 # the background id + boxes = gt_boxes[best_target_per_prior_index] + return boxes, labels + + +def hard_negative_mining(loss, labels, neg_pos_ratio): + """ + It is used to suppress the presence of a large number of negative predictions. + It works at the image level, not the batch level. + For any example/image, it keeps all the positive predictions and + cuts the number of negative predictions to make sure the ratio + between the negative examples and positive examples is no more than + the given ratio for an image. + + Args: + loss (N, num_priors): the loss for each example. + labels (N, num_priors): the labels. + neg_pos_ratio: the ratio between the negative examples and positive examples.
+ """ + pos_mask = labels > 0 + num_pos = pos_mask.long().sum(dim=1, keepdim=True) + num_neg = num_pos * neg_pos_ratio + + loss[pos_mask] = -math.inf + _, indexes = loss.sort(dim=1, descending=True) + _, orders = indexes.sort(dim=1) + neg_mask = orders < num_neg + return pos_mask | neg_mask + + +def center_form_to_corner_form(locations): + return torch.cat([locations[..., :2] - locations[..., 2:] / 2, + locations[..., :2] + locations[..., 2:] / 2], locations.dim() - 1) + + +def corner_form_to_center_form(boxes): + return torch.cat([ + (boxes[..., :2] + boxes[..., 2:]) / 2, + boxes[..., 2:] - boxes[..., :2] + ], boxes.dim() - 1) + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + _, indexes = scores.sort(descending=True) + indexes = indexes[:candidate_size] + while len(indexes) > 0: + current = indexes[0] + picked.append(current.item()) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[1:] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + current_box.unsqueeze(0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def nms(box_scores, nms_method=None, score_threshold=None, iou_threshold=None, + sigma=0.5, top_k=-1, candidate_size=200): + if nms_method == "soft": + return soft_nms(box_scores, score_threshold, sigma, top_k) + else: + return hard_nms(box_scores, iou_threshold, top_k, candidate_size=candidate_size) + + +def soft_nms(box_scores, score_threshold, sigma=0.5, top_k=-1): + """Soft NMS implementation. + + References: + https://arxiv.org/abs/1704.04503 + https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/cython_nms.pyx + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + score_threshold: boxes with scores less than value are not considered. + sigma: the parameter in score re-computation. + scores[i] = scores[i] * exp(-(iou_i)^2 / simga) + top_k: keep top_k results. If k <= 0, keep all the results. + Returns: + picked_box_scores (K, 5): results of NMS. 
+ """ + picked_box_scores = [] + while box_scores.size(0) > 0: + max_score_index = torch.argmax(box_scores[:, 4]) + cur_box_prob = torch.tensor(box_scores[max_score_index, :]) + picked_box_scores.append(cur_box_prob) + if len(picked_box_scores) == top_k > 0 or box_scores.size(0) == 1: + break + cur_box = cur_box_prob[:-1] + box_scores[max_score_index, :] = box_scores[-1, :] + box_scores = box_scores[:-1, :] + ious = iou_of(cur_box.unsqueeze(0), box_scores[:, :-1]) + box_scores[:, -1] = box_scores[:, -1] * torch.exp(-(ious * ious) / sigma) + box_scores = box_scores[box_scores[:, -1] > score_threshold, :] + if len(picked_box_scores) > 0: + return torch.stack(picked_box_scores) + else: + return torch.tensor([]) diff --git a/src/vision/utils/box_utils_numpy.py b/src/vision/utils/box_utils_numpy.py new file mode 100644 index 0000000..25fc207 --- /dev/null +++ b/src/vision/utils/box_utils_numpy.py @@ -0,0 +1,238 @@ +import itertools +import math +from typing import List + +import numpy as np + +from .box_utils import SSDSpec + + +def generate_ssd_priors(specs: List[SSDSpec], image_size, clamp=True): + """Generate SSD Prior Boxes. + + It returns the center, height and width of the priors. The values are relative to the image size + Args: + specs: SSDSpecs about the shapes of sizes of prior boxes. i.e. + specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) + ] + image_size: image size. + clamp: if true, clamp the values to make fall between [0.0, 1.0] + Returns: + priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values + are relative to the image size. + """ + priors = [] + for spec in specs: + scale = image_size / spec.shrinkage + for j, i in itertools.product(range(spec.feature_map_size), repeat=2): + x_center = (i + 0.5) / scale + y_center = (j + 0.5) / scale + + # small sized square box + size = spec.box_sizes.min + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # big sized square box + size = math.sqrt(spec.box_sizes.max * spec.box_sizes.min) + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # change h/w ratio of the small sized box + size = spec.box_sizes.min + h = w = size / image_size + for ratio in spec.aspect_ratios: + ratio = math.sqrt(ratio) + priors.append([ + x_center, + y_center, + w * ratio, + h / ratio + ]) + priors.append([ + x_center, + y_center, + w / ratio, + h * ratio + ]) + + priors = np.array(priors, dtype=np.float32) + if clamp: + np.clip(priors, 0.0, 1.0, out=priors) + return priors + + +def convert_locations_to_boxes(locations, priors, center_variance, + size_variance): + """Convert regressional location results of SSD into boxes in the form of (center_x, center_y, h, w). + + The conversion: + $$predicted\_center * center_variance = \frac {real\_center - prior\_center} {prior\_hw}$$ + $$exp(predicted\_hw * size_variance) = \frac {real\_hw} {prior\_hw}$$ + We do it in the inverse direction here. + Args: + locations (batch_size, num_priors, 4): the regression output of SSD. It will contain the outputs as well. + priors (num_priors, 4) or (batch_size/1, num_priors, 4): prior boxes. + center_variance: a float used to change the scale of center. + size_variance: a float used to change of scale of size. 
+ Returns: + boxes: priors: [[center_x, center_y, h, w]]. All the values + are relative to the image size. + """ + # priors can have one dimension less. + if len(priors.shape) + 1 == len(locations.shape): + priors = np.expand_dims(priors, 0) + return np.concatenate([ + locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2], + np.exp(locations[..., 2:] * size_variance) * priors[..., 2:] + ], axis=len(locations.shape) - 1) + + +def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance): + # priors can have one dimension less + if len(center_form_priors.shape) + 1 == len(center_form_boxes.shape): + center_form_priors = np.expand_dims(center_form_priors, 0) + return np.concatenate([ + (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance, + np.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance + ], axis=len(center_form_boxes.shape) - 1) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + + Returns: + area (N): return the area. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def center_form_to_corner_form(locations): + return np.concatenate([locations[..., :2] - locations[..., 2:] / 2, + locations[..., :2] + locations[..., 2:] / 2], len(locations.shape) - 1) + + +def corner_form_to_center_form(boxes): + return np.concatenate([ + (boxes[..., :2] + boxes[..., 2:]) / 2, + boxes[..., 2:] - boxes[..., :2] + ], len(boxes.shape) - 1) + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. 
+ Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + # _, indexes = scores.sort(descending=True) + indexes = np.argsort(scores) + # indexes = indexes[:candidate_size] + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + # current = indexes[0] + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + # indexes = indexes[1:] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims(current_box, axis=0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + +# def nms(box_scores, nms_method=None, score_threshold=None, iou_threshold=None, +# sigma=0.5, top_k=-1, candidate_size=200): +# if nms_method == "soft": +# return soft_nms(box_scores, score_threshold, sigma, top_k) +# else: +# return hard_nms(box_scores, iou_threshold, top_k, candidate_size=candidate_size) + +# +# def soft_nms(box_scores, score_threshold, sigma=0.5, top_k=-1): +# """Soft NMS implementation. +# +# References: +# https://arxiv.org/abs/1704.04503 +# https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/cython_nms.pyx +# +# Args: +# box_scores (N, 5): boxes in corner-form and probabilities. +# score_threshold: boxes with scores less than value are not considered. +# sigma: the parameter in score re-computation. +# scores[i] = scores[i] * exp(-(iou_i)^2 / simga) +# top_k: keep top_k results. If k <= 0, keep all the results. +# Returns: +# picked_box_scores (K, 5): results of NMS. +# """ +# picked_box_scores = [] +# while box_scores.size(0) > 0: +# max_score_index = torch.argmax(box_scores[:, 4]) +# cur_box_prob = torch.tensor(box_scores[max_score_index, :]) +# picked_box_scores.append(cur_box_prob) +# if len(picked_box_scores) == top_k > 0 or box_scores.size(0) == 1: +# break +# cur_box = cur_box_prob[:-1] +# box_scores[max_score_index, :] = box_scores[-1, :] +# box_scores = box_scores[:-1, :] +# ious = iou_of(cur_box.unsqueeze(0), box_scores[:, :-1]) +# box_scores[:, -1] = box_scores[:, -1] * torch.exp(-(ious * ious) / sigma) +# box_scores = box_scores[box_scores[:, -1] > score_threshold, :] +# if len(picked_box_scores) > 0: +# return torch.stack(picked_box_scores) +# else: +# return torch.tensor([]) diff --git a/src/vision/utils/measurements.py b/src/vision/utils/measurements.py new file mode 100644 index 0000000..5cc590c --- /dev/null +++ b/src/vision/utils/measurements.py @@ -0,0 +1,32 @@ +import numpy as np + + +def compute_average_precision(precision, recall): + """ + It computes average precision based on the definition of Pascal Competition. It computes the under curve area + of precision and recall. Recall follows the normal definition. Precision is a variant. + pascal_precision[i] = typical_precision[i:].max() + """ + # identical but faster version of new_precision[i] = old_precision[i:].max() + precision = np.concatenate([[0.0], precision, [0.0]]) + for i in range(len(precision) - 1, 0, -1): + precision[i - 1] = np.maximum(precision[i - 1], precision[i]) + + # find the index where the value changes + recall = np.concatenate([[0.0], recall, [1.0]]) + changing_points = np.where(recall[1:] != recall[:-1])[0] + + # compute under curve area + areas = (recall[changing_points + 1] - recall[changing_points]) * precision[changing_points + 1] + return areas.sum() + + +def compute_voc2007_average_precision(precision, recall): + ap = 0. 
+ for t in np.arange(0., 1.1, 0.1): + if np.sum(recall >= t) == 0: + p = 0 + else: + p = np.max(precision[recall >= t]) + ap = ap + p / 11. + return ap diff --git a/src/vision/utils/misc.py b/src/vision/utils/misc.py new file mode 100644 index 0000000..f67ee4b --- /dev/null +++ b/src/vision/utils/misc.py @@ -0,0 +1,46 @@ +import time + +import torch + + +def str2bool(s): + return s.lower() in ('true', '1') + + +class Timer: + def __init__(self): + self.clock = {} + + def start(self, key="default"): + self.clock[key] = time.time() + + def end(self, key="default"): + if key not in self.clock: + raise Exception(f"{key} is not in the clock.") + interval = time.time() - self.clock[key] + del self.clock[key] + return interval + + +def save_checkpoint(epoch, net_state_dict, optimizer_state_dict, best_score, checkpoint_path, model_path): + torch.save({ + 'epoch': epoch, + 'model': net_state_dict, + 'optimizer': optimizer_state_dict, + 'best_score': best_score + }, checkpoint_path) + torch.save(net_state_dict, model_path) + + +def load_checkpoint(checkpoint_path): + return torch.load(checkpoint_path) + + +def freeze_net_layers(net): + for param in net.parameters(): + param.requires_grad = False + + +def store_labels(path, labels): + with open(path, "w") as f: + f.write("\n".join(labels)) diff --git a/src/vision/utils/model_book.py b/src/vision/utils/model_book.py new file mode 100644 index 0000000..763b79b --- /dev/null +++ b/src/vision/utils/model_book.py @@ -0,0 +1,82 @@ +from collections import OrderedDict + +import torch.nn as nn + + +class ModelBook: + """Maintain the mapping between modules and their paths. + + Example: + book = ModelBook(model_ft) + for p, m in book.conv2d_modules(): + print('path:', p, 'num of filters:', m.out_channels) + assert m is book.get_module(p) + """ + + def __init__(self, model): + self._model = model + self._modules = OrderedDict() + self._paths = OrderedDict() + path = [] + self._construct(self._model, path) + + def _construct(self, module, path): + if not module._modules: + return + for name, m in module._modules.items(): + cur_path = tuple(path + [name]) + self._paths[m] = cur_path + self._modules[cur_path] = m + self._construct(m, path + [name]) + + def conv2d_modules(self): + return self.modules(nn.Conv2d) + + def linear_modules(self): + return self.modules(nn.Linear) + + def modules(self, module_type=None): + for p, m in self._modules.items(): + if not module_type or isinstance(m, module_type): + yield p, m + + def num_of_conv2d_modules(self): + return self.num_of_modules(nn.Conv2d) + + def num_of_conv2d_filters(self): + """Return the sum of out_channels of all conv2d layers. + + Here we treat the sub weight with size of [in_channels, h, w] as a single filter. + """ + num_filters = 0 + for _, m in self.conv2d_modules(): + num_filters += m.out_channels + return num_filters + + def num_of_linear_modules(self): + return self.num_of_modules(nn.Linear) + + def num_of_linear_filters(self): + num_filters = 0 + for _, m in self.linear_modules(): + num_filters += m.out_features + return num_filters + + def num_of_modules(self, module_type=None): + num = 0 + for p, m in self._modules.items(): + if not module_type or isinstance(m, module_type): + num += 1 + return num + + def get_module(self, path): + return self._modules.get(path) + + def get_path(self, module): + return self._paths.get(module) + + def update(self, path, module): + old_module = self._modules[path] + del self._paths[old_module] + self._paths[module] = path + self._modules[path] = module
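Taken together, box_utils.py supplies the pieces the Predictor uses at inference time: decode the regression output against the priors, convert to corner form, then run per-class NMS on thresholded scores. The sketch below wires those calls together; it is not the repository's Predictor (that lives in vision/ssd/predictor.py), random tensors stand in for real network output, and the 0.1/0.2 variances, 0.5 score cut-off, and 0.45 IoU threshold are the usual SSD defaults rather than values read from this patch.

import torch

from vision.utils import box_utils

num_priors, num_classes = 8732, 4
priors = torch.rand(num_priors, 4)                # center-form priors in [0, 1]
locations = torch.randn(1, num_priors, 4)         # raw regression output of the SSD heads
confidences = torch.softmax(torch.randn(1, num_priors, num_classes), dim=2)

# decode locations against the priors, then switch to corner form for NMS
boxes = box_utils.convert_locations_to_boxes(locations, priors,
                                             center_variance=0.1, size_variance=0.2)
boxes = box_utils.center_form_to_corner_form(boxes)[0]   # (num_priors, 4)
scores = confidences[0]

for class_id in range(1, num_classes):            # class 0 is the background label
    probs = scores[:, class_id]
    mask = probs > 0.5
    if mask.sum() == 0:
        continue
    # hard_nms expects (K, 5) rows of [x1, y1, x2, y2, score]
    box_scores = torch.cat([boxes[mask], probs[mask].unsqueeze(1)], dim=1)
    kept = box_utils.hard_nms(box_scores, iou_threshold=0.45, top_k=10)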