├── .gitignore
├── 1_computeRois.py
├── 2_cntkGenerateInputs.py
├── 3_runCntk.py
├── 4_trainSvm.py
├── 5_evaluateResults.py
├── 5_visualizeResults.py
├── 6_scoreImage.py
├── A1_annotateImages.py
├── A2_annotateBboxLabels.py
├── B1_evaluateRois.py
├── B2_cntkVisualizeInputs.py
├── B3_cntkAnalyzeInputs.py
├── PARAMETERS.py
├── README.md
├── __init__.py
├── data
│   └── grocery
│       ├── negative
│       │   ├── 1.jpg
│       │   ├── 2.jpg
│       │   ├── 3.jpg
│       │   ├── 4.jpg
│       │   └── 5.jpg
│       ├── positive
│       │   ├── 0.bboxes.labels.tsv
│       │   ├── 0.bboxes.tsv
│       │   ├── 0.jpg
│       │   ├── 11.bboxes.labels.tsv
│       │   ├── 11.bboxes.tsv
│       │   ├── 11.jpg
│       │   ├── 12.bboxes.labels.tsv
│       │   ├── 12.bboxes.tsv
│       │   ├── 12.jpg
│       │   ├── 13.bboxes.labels.tsv
│       │   ├── 13.bboxes.tsv
│       │   ├── 13.jpg
│       │   ├── 14.bboxes.labels.tsv
│       │   ├── 14.bboxes.tsv
│       │   ├── 14.jpg
│       │   ├── 17.bboxes.labels.tsv
│       │   ├── 17.bboxes.tsv
│       │   ├── 17.jpg
│       │   ├── 18.bboxes.labels.tsv
│       │   ├── 18.bboxes.tsv
│       │   ├── 18.jpg
│       │   ├── 19.bboxes.labels.tsv
│       │   ├── 19.bboxes.tsv
│       │   ├── 19.jpg
│       │   ├── 2.bboxes.labels.tsv
│       │   ├── 2.bboxes.tsv
│       │   ├── 2.jpg
│       │   ├── 21.bboxes.labels.tsv
│       │   ├── 21.bboxes.tsv
│       │   ├── 21.jpg
│       │   ├── 22.bboxes.labels.tsv
│       │   ├── 22.bboxes.tsv
│       │   ├── 22.jpg
│       │   ├── 23.bboxes.labels.tsv
│       │   ├── 23.bboxes.tsv
│       │   ├── 23.jpg
│       │   ├── 24.bboxes.labels.tsv
│       │   ├── 24.bboxes.tsv
│       │   ├── 24.jpg
│       │   ├── 26.bboxes.labels.tsv
│       │   ├── 26.bboxes.tsv
│       │   ├── 26.jpg
│       │   ├── 3.bboxes.labels.tsv
│       │   ├── 3.bboxes.tsv
│       │   ├── 3.jpg
│       │   ├── 4.bboxes.labels.tsv
│       │   ├── 4.bboxes.tsv
│       │   ├── 4.jpg
│       │   ├── 6.bboxes.labels.tsv
│       │   ├── 6.bboxes.tsv
│       │   ├── 6.jpg
│       │   ├── 7.bboxes.labels.tsv
│       │   ├── 7.bboxes.tsv
│       │   ├── 7.jpg
│       │   ├── 8.bboxes.labels.tsv
│       │   ├── 8.bboxes.tsv
│       │   ├── 8.jpg
│       │   ├── 9.bboxes.labels.tsv
│       │   ├── 9.bboxes.tsv
│       │   └── 9.jpg
│       └── testImages
│           ├── 10.bboxes.labels.tsv
│           ├── 10.bboxes.tsv
│           ├── 10.jpg
│           ├── 15.bboxes.labels.tsv
│           ├── 15.bboxes.tsv
│           ├── 15.jpg
│           ├── 20.bboxes.labels.tsv
│           ├── 20.bboxes.tsv
│           ├── 20.jpg
│           ├── 25.bboxes.labels.tsv
│           ├── 25.bboxes.tsv
│           ├── 25.jpg
│           ├── 5.bboxes.labels.tsv
│           ├── 5.bboxes.tsv
│           └── 5.jpg
├── deprecated_3_runCntk_brainscript.py
├── doc
│   ├── 0.filter.roi.jpg
│   ├── 0.grid.roi.jpg
│   ├── 0.ss.roi.jpg
│   ├── anno_boxes.jpg
│   ├── anno_labels.jpg
│   ├── nn_00.jpg
│   ├── nn_00_no_nms.jpg
│   ├── nn_01.jpg
│   ├── nn_110.jpg
│   ├── nn_215.jpg
│   ├── nn_425.jpg
│   ├── nn_55.jpg
│   ├── precision_recall.jpg
│   ├── rcnnPipeline.JPG
│   ├── svm_010.jpg
│   ├── svm_115.jpg
│   ├── svm_220.jpg
│   ├── svm_325.jpg
│   └── svm_45.jpg
├── fastRCNN
│   ├── __init__.py
│   ├── imdb.py
│   ├── nms.py
│   ├── pascal_voc.py
│   ├── test.py
│   ├── timer.py
│   ├── train_svms.py
│   ├── utils34_win64
│   │   ├── cython_bbox.pyd
│   │   └── cython_nms.pyd
│   ├── utils35_win64
│   │   ├── cython_bbox.pyd
│   │   └── cython_nms.pyd
│   └── voc_eval.py
├── helpers.py
├── helpers_cntk.py
├── imdb_data.py
└── resources
    ├── cntk
    │   ├── config.cntk
    │   └── model.pdf
    └── python35_64bit_requirements
        ├── opencv_python-3.2.0-cp35-cp35m-win_amd64.whl
        └── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pyx
3 | /__pycache__/
4 | /backup_v0/
5 | /data/liebherr_v4/
6 | /fastRCNN/__pycache__/
7 | /proc/
8 | /resources/cntk/AlexNet.model
9 | /resources/pascalVocData/
10 | /results/
11 | /selectivesearch/
12 |
--------------------------------------------------------------------------------
/1_computeRois.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys, os, importlib, random
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)  # import all PARAMETERS globals into this script's namespace
5 |
6 |
7 | ####################################
8 | # Parameters
9 | ####################################
10 | boShowImg = True
11 | subdirs = ['positive', 'testImages', 'negative']
12 |
13 | #no need to change these parameters
14 | boAddSelectiveSearchROIs = True
15 | boAddGridROIs = True
16 | boFilterROIs = True
17 | if datasetName.lower() == "pascalvoc":
18 | print("No need to run ROI computation since Pascal VOC comes with pre-computed ROIs.")
19 | exit()
20 |
21 |
22 | ####################################
23 | # Main
24 | ####################################
25 | #init
26 | for subdir in subdirs:
27 | makeDirectory(roiDir)
28 | makeDirectory(roiDir + subdir)
29 | imgFilenames = getFilesInDirectory(imgDir + subdir, ".jpg")
30 |
31 | #loop over all images
32 | times = []
33 | for imgIndex, imgFilename in enumerate(imgFilenames):
34 | #if os.path.exists(roiPath):
35 | #    print("Skipping image since roi file already exists: {} {}".format(imgFilename, imgIndex))
36 | #    continue
37 |
38 | # load image
39 | print("Processing image {} of {}: subdir={}, filename={}".format(imgIndex, len(imgFilenames), subdir, imgFilename))
40 | imgPath = join(imgDir, subdir, imgFilename)
41 | imgOrig = imread(imgPath)
42 |
43 | # compute ROIs
44 | tstart = datetime.datetime.now()
45 | rois = computeRois(imgOrig, boAddSelectiveSearchROIs, boAddGridROIs, boFilterROIs, ss_kvals, ss_minSize, ss_max_merging_iterations, ss_nmsThreshold,
46 | roi_minDimRel, roi_maxDimRel, roi_maxImgDim, roi_maxAspectRatio, roi_minNrPixelsRel, roi_maxNrPixelsRel,
47 | grid_nrScales, grid_aspectRatios, grid_downscaleRatioPerIteration, grid_stepSizeRel)
48 | times.append((datetime.datetime.now() - tstart).total_seconds() * 1000)
49 | print(" Time roi computation [ms]: " + str(times[-1]))
50 | roiPath = "{}/{}/{}.roi.txt".format(roiDir, subdir, imgFilename[:-4])
51 | np.savetxt(roiPath, rois, fmt='%d')
52 |
53 | #visualize ROIs
54 | if boShowImg:
55 | debugScale = 800.0 / max(imWidthHeight(imgOrig))
56 | img = imresize(imgOrig, debugScale)
57 | drawRectangles(img, rois*debugScale, color=(0, 255, 0), thickness=1)
58 | imshow(img, waitDuration = 1)
59 | roiImgPath = os.path.join(roiDir, subdir, imgFilename[:-4] + ".roi.jpg")
60 | imwrite(img, roiImgPath)
61 |
62 | print("Time per image [ms]: median={:.1f}, std={:.1f}, 90%-percentile={:.1f}".format(np.median(times), np.std(times), np.percentile(times, 90)))
63 | print("DONE.")
--------------------------------------------------------------------------------
/2_cntkGenerateInputs.py:
--------------------------------------------------------------------------------
1 | import os, sys, importlib
2 | import shutil, time
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)
5 |
6 |
7 | ####################################
8 | # Parameters
9 | ####################################
10 | image_sets = ["train", "test"]
11 |
12 |
13 | ####################################
14 | # Main
15 | ####################################
16 | #clear imdb cache and other files
17 | if os.path.exists(cntkFilesDir):
18 | assert(cntkFilesDir.endswith("cntkFiles/"))
19 | userInput = input('--> INPUT: Press "y" to delete directory ' + cntkFilesDir + ": ")
20 | if userInput.lower() not in ['y', 'yes']:
21 | print("User input is %s: exiting now." % userInput)
22 | exit(-1)
23 | shutil.rmtree(cntkFilesDir)
24 | time.sleep(0.2) #avoid file access errors
25 |
26 |
27 | #create cntk representation for each image
28 | makeDirectory(cntkFilesDir)
29 | for image_set in image_sets:
30 | imdb = imdbs[image_set]
31 | counterGt = np.zeros(len(classes), np.int32)
32 | print("Number of images in set '{}' = {}".format(image_set, imdb.num_images))
33 |
34 | #open files for writing
35 | cntkImgsPath, cntkRoiCoordsPath, cntkRoiLabelsPath, nrRoisPath = cntkInputPaths(cntkFilesDir, image_set)
36 | with open(cntkImgsPath, 'w') as cntkImgsFile, \
37 | open(cntkRoiCoordsPath, 'w') as cntkRoiCoordsFile, \
38 | open(cntkRoiLabelsPath, 'w') as cntkRoiLabelsFile, \
39 | open(nrRoisPath, 'w') as nrRoisFile:
40 |
41 | # for each image, transform rois etc to cntk format
42 | for imgIndex in range(0, imdb.num_images):
43 | if imgIndex % 200 == 0:
44 | print("Processing image set '{}', image {} of {}".format(image_set, imgIndex, imdb.num_images))
45 | imgPath = imdb.image_path_at(imgIndex)
46 | currRois = imdb.roidb[imgIndex]['boxes']
47 | currGtOverlaps = imdb.roidb[imgIndex]['gt_overlaps']
48 | for i in imdb.roidb[imgIndex]['gt_classes']:
49 | counterGt[i] += 1
50 |
51 | #get DNN inputs for image
52 | #Note: this also marks other ROIs as 'positives', if overlap with GT is above a threshold
53 | labelsStr, roisStr, _ = getCntkInputs(imgPath, currRois, currGtOverlaps, train_posOverlapThres, nrClasses, cntk_nrRois, cntk_padWidth, cntk_padHeight)
54 |
55 | #update cntk data
56 | nrRoisFile.write("{}\n".format(len(currRois)))
57 | cntkImgsFile.write("{}\t{}\t0\n".format(imgIndex, imgPath))
58 | cntkRoiCoordsFile.write("{} |rois{}\n".format(imgIndex, roisStr))
59 | cntkRoiLabelsFile.write("{} |roiLabels{}\n".format(imgIndex, labelsStr))
60 |
61 | #print debug info
62 | if image_set == 'train':
63 | for i in range(len(classes)):
64 | print(" {:3}: Found {} objects of class {}.".format(i, counterGt[i], classes[i]))
65 |
66 | print("DONE.")
67 |
--------------------------------------------------------------------------------
/3_runCntk.py:
--------------------------------------------------------------------------------
1 | from PARAMETERS import *
2 | from helpers_cntk import *
3 |
4 |
5 | ####################################
6 | # MAIN
7 | ####################################
8 | makeDirectory(modelDir)
9 | print ("classifier = " + classifier)
10 | print ("cntk_lr_per_image = " + str(cntk_lr_per_image))
11 |
12 | # optionally retrain the DNN:
13 | # if the classifier is 'svm', simply return the 4096-float penultimate layer as the model;
14 | # otherwise add a new output layer, retrain the DNN, and return this new model.
15 | if classifier == 'svm':
16 | boSkipTraining = True
17 | else:
18 | boSkipTraining = False
19 | model = init_train_fast_rcnn(cntk_padHeight, cntk_padWidth, nrClasses, cntk_nrRois, cntk_mb_size, cntk_max_epochs,
20 | cntk_lr_per_image, cntk_l2_reg_weight, cntk_momentum_time_constant, cntkFilesDir, boSkipTraining)
21 |
22 | # write model to disk
23 | model_path = os.path.join(modelDir, "frcn_" + classifier + ".model")
24 | print("Writing model to %s" % model_path)
25 | model.save(model_path)
26 |
27 | # compute output of every image and write to disk
28 | image_sets = ["test", "train"]
29 | for image_set in image_sets:
30 | outParsedDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/"
31 | makeDirectory(outParsedDir)
32 | run_fast_rcnn(model, image_set, cntk_padHeight, cntk_padWidth, nrClasses, cntk_nrRois, cntkFilesDir, outParsedDir)
33 |
34 | print("DONE.")
--------------------------------------------------------------------------------
/4_trainSvm.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from fastRCNN.train_svms import SVMTrainer
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)
5 |
6 |
7 | #################################################
8 | # Parameters
9 | #################################################
10 | experimentName = "exp1"
11 |
12 | #no need to change these params
13 | cntkParsedOutputDir = cntkFilesDir + "train_svm_parsed/"
14 |
15 |
16 |
17 | #################################################
18 | # Main
19 | #################################################
20 | if classifier == "nn":
21 | print("No need to train SVM since using 'nn' classifier.")
22 | exit()
23 | print ("svm_targetNorm = " + str(svm_targetNorm))
24 | print ("svm_retrainLimit = " + str(svm_retrainLimit))
25 | print ("svm_posWeight = " + str(svm_posWeight))
26 | print ("svm_C = " + str(svm_C))
27 | print ("svm_B = " + str(svm_B))
28 | print ("svm_penality = " + str(svm_penality))
29 | print ("svm_loss = " + str(svm_loss))
30 | print ("svm_evictThreshold = " + str(svm_evictThreshold))
31 | print ("svm_nrEpochs = " + str(svm_nrEpochs))
32 |
33 | #init
34 | makeDirectory(trainedSvmDir)
35 | np.random.seed(svm_rngSeed)
36 | imdb = imdbs["train"]
37 | net = DummyNet(4096, imdb.num_classes, cntkParsedOutputDir)
38 | svmWeightsPath, svmBiasPath, svmFeatScalePath = svmModelPaths(trainedSvmDir, experimentName)
39 |
40 | # add ROIs which significantly overlap with a ground truth object as positives
41 | if train_posOverlapThres > 0:
42 | print ("Adding ROIs with gt overlap >= %2.2f as positives ..." % (train_posOverlapThres))
43 | existingPosCounter, addedPosCounter = updateRoisGtClassIfHighGtOverlap(imdb, train_posOverlapThres)
44 | print ("Number of positives originally: {} (in {} images)".format(existingPosCounter, imdb.num_images))
45 | print ("Number of additional positives: {}.".format(addedPosCounter))
46 |
47 | # start training
48 | svm = SVMTrainer(net, imdb, im_detect, svmWeightsPath, svmBiasPath, svmFeatScalePath,
49 | svm_C, svm_B, svm_nrEpochs, svm_retrainLimit, svm_evictThreshold, svm_posWeight,
50 | svm_targetNorm, svm_penality, svm_loss, svm_rngSeed)
51 | svm.train()
52 | print ("DONE.")
53 |
--------------------------------------------------------------------------------
/5_evaluateResults.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from fastRCNN.test import test_net
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)
5 |
6 |
7 | ####################################
8 | # Parameters
9 | ####################################
10 | image_set = 'test'
11 | svmExperimentName = "exp1"
12 |
13 | #no need to change these
14 | cntkParsedOutputDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/"
15 |
16 |
17 | ####################################
18 | # Main
19 | ####################################
20 | print("classifier = " + classifier)
21 | print("image_set = " + image_set)
22 | imdb = imdbs[image_set]
23 | net = DummyNet(4096, imdb.num_classes, cntkParsedOutputDir)
24 |
25 | #load svm
26 | svmFeatScale = None
27 | if classifier == 'svm':
28 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svmExperimentName)
29 | net.params['cls_score'][0].data = svmWeights
30 | net.params['cls_score'][1].data = svmBias
31 |
32 | #create empty directory for evaluation files
33 | if type(imdb) == imdb_data:
34 | evalTempDir = None
35 | else:
36 | #pascal_voc implementation requires temporary directory for evaluation
37 | evalTempDir = os.path.join(procDir, "eval_mAP_" + image_set)
38 | makeDirectory(evalTempDir)
39 | deleteAllFilesInDirectory(evalTempDir, None)
40 |
41 | #compute mAPs
42 | evalResults = test_net(net, imdb, evalTempDir, svmFeatScale, classifier, nmsThreshold, boUsePythonImpl = True, overlapThreshold = evalVocOverlapThreshold) #, boApplyNms = False) #, boThresholdDetections = False)
43 | #writeTable("evalResults.tsv", [["CLASS","Average Precision (AP)"]] + evalResults)
44 |
45 | print("DONE.")
--------------------------------------------------------------------------------
/5_visualizeResults.py:
--------------------------------------------------------------------------------
1 | import os, importlib, sys
2 | import PARAMETERS
3 | locals().update(importlib.import_module("PARAMETERS").__dict__)
4 |
5 |
6 | ####################################
7 | # Parameters
8 | ####################################
9 | image_set = 'test' #'train', 'test'
10 | svm_experimentName = 'exp1'
11 |
12 | #no need to change these parameters
13 | boIncludeGroundTruthRois = False #remove GT (perfect) ROIs which were added to the 'train' imageSet
14 | boUseNonMaximaSurpression = True
15 | visualizationDir = resultsDir + "visualizations"
16 | cntkParsedOutputDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/"
17 | if classifier == 'svm':
18 | prThresholds = np.linspace(0, 10, 21)
19 | else:
20 | prThresholds = np.linspace(0, 1, 21)
21 |
22 |
23 |
24 | ####################################
25 | # Main
26 | ####################################
27 | #init
28 | imdb = imdbs[image_set]
29 | gt_roidb = imdb.gt_roidb()
30 | recalls = collections.defaultdict(list)
31 | precisions = collections.defaultdict(list)
32 |
33 | #load svm
34 | print("classifier = " + classifier)
35 | makeDirectory(resultsDir)
36 | makeDirectory(visualizationDir)
37 | if classifier == "svm":
38 | print("Loading svm weights..")
39 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svm_experimentName)
40 | else:
41 | svmWeights, svmBias, svmFeatScale = (None, None, None)
42 |
43 |
44 | #loop over all images and visualize
45 | for imgIndex in range(0, imdb.num_images):
46 | imgPath = imdb.image_path_at(imgIndex)
47 | imgWidth, imgHeight = imWidthHeight(imgPath)
48 | print("Processing image {} of {}: {}".format(imgIndex, imdb.num_images, imgPath))
49 |
50 | #load DNN output
51 | cntkOutputPath = os.path.join(cntkParsedOutputDir, str(imgIndex) + ".dat.npz")
52 | dnnOutput = np.load(cntkOutputPath)['arr_0']
53 | assert(len(dnnOutput) == cntk_nrRois)
54 |
55 | #evaluate classifier for all rois and remove the zero-padded rois
56 | labels, scores = scoreRois(classifier, dnnOutput, svmWeights, svmBias, svmFeatScale, len(classes)) #, vis_decisionThresholds[classifier])
57 | scores = scores[:len(imdb.roidb[imgIndex]['boxes'])]
58 | labels = labels[:len(imdb.roidb[imgIndex]['boxes'])]
59 |
60 | #remove the ground truth ROIs which were added for training purposes
61 | if not boIncludeGroundTruthRois:
62 | inds = np.where(imdb.roidb[imgIndex]['gt_classes'] == 0)[0]
63 | labels = [labels[i] for i in inds]
64 | scores = [scores[i] for i in inds]
65 | imdb.roidb[imgIndex]['boxes'] = imdb.roidb[imgIndex]['boxes'][inds]
66 |
67 | #perform non-maxima suppression. note that the set of labels detected in the image is not affected by this.
68 | nmsKeepIndices = []
69 | if boUseNonMaximaSurpression:
70 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, labels, scores, imdb.roidb[imgIndex]['boxes'])
71 | print("Non-maxima surpression kept {:4} of {:4} rois (nmsThreshold={})".format(len(nmsKeepIndices), len(labels), nmsThreshold))
72 |
73 | #visualize results
74 | imgDebug = visualizeResults(imgPath, labels, scores, imdb.roidb[imgIndex]['boxes'], classes, nmsKeepIndices,
75 | boDrawNegativeRois=False, boDrawNmsRejectedRois=False, decisionThreshold = vis_decisionThresholds[classifier])
76 | imshow(imgDebug, waitDuration=1, maxDim = 800)
77 | imwrite(imgDebug, visualizationDir + "/" + classifier + "_" + str(imgIndex) + os.path.basename(imgPath))
78 |
79 |
80 | #compute precision recall of the detection for different thresholds
81 | gtLabels = gt_roidb[imgIndex]['gt_classes']
82 | gtBboxes = [Bbox(*rect) for rect in gt_roidb[imgIndex]['boxes']]
83 |
84 | for thres in prThresholds:
85 | # get detections with scores higher than the threshold and which were kept by nms
86 | keepInds = set(np.where((np.array(labels) > 0) & (np.array(scores) > thres))[0])
87 | if boUseNonMaximaSurpression:
88 | keepInds = keepInds.intersection(nmsKeepIndices)
89 | detLabels = [labels[i] for i in keepInds]
90 | detBboxes = [Bbox(*imdb.roidb[imgIndex]['boxes'][i]) for i in keepInds]
91 |
92 | #compute precision recall of the detection
93 | precision, recall = detPrecisionRecall(detBboxes, detLabels, gtBboxes, gtLabels,
94 | evalVocOverlapThreshold, boPenalizeMultipleDetections=False)
95 | recalls[thres].append(recall)
96 | if precision is not None:
97 | precisions[thres].append(precision)
98 |
99 |
100 | #compute precision and recall at different thresholds
101 | print("Precision/recall when rejecting detections below a given threshold:")
102 | outPR = [("Threshold", "Precision", "Recall")]
103 | for thres in prThresholds:
104 | if precisions[thres] == []:
105 | break
106 | p = np.mean(precisions[thres])
107 | r = np.mean(recalls[thres])
108 | outPR.append((thres, p, r))
109 | print(" At threshold {:.2f}: precision = {:2.2f}, recall = {:2.2f}".format(thres, p, r))
110 | #writeTable("precisionRecalls.tsv", outPR)
111 |
112 | print("DONE.")
--------------------------------------------------------------------------------
/6_scoreImage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys, os, importlib, random, json
3 | import PARAMETERS
4 | from helpers_cntk import *
5 | locals().update(importlib.import_module("PARAMETERS").__dict__)
6 |
7 |
8 | ####################################
9 | # Parameters
10 | ####################################
11 | imgPath = r"C:/Users/pabuehle/Desktop/newImgs/WIN_20160803_11_30_07_Pro.jpg"
12 |
13 | #choose which classifier to use
14 | classifier = 'svm'
15 | svm_experimentName = 'exp1'
16 |
17 | # no need to change these parameters
18 | boAddSelectiveSearchROIs = True
19 | boAddGridROIs = True
20 | boFilterROIs = True
21 | boUseNonMaximaSurpression = True
22 |
23 |
24 | ####################################
25 | # Main
26 | ####################################
27 | random.seed(0)
28 |
29 | # load cntk model
30 | print("Loading DNN..")
31 | tstart = datetime.datetime.now()
32 | model_path = os.path.join(modelDir, "frcn_" + classifier + ".model")
33 | if not os.path.exists(model_path):
34 | raise Exception("Model {} not found.".format(model_path))
35 | model = load_model(model_path)
36 | print("Time loading DNN [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
37 |
38 | # load trained svm
39 | if classifier == "svm":
40 | print("Loading svm weights..")
41 | tstart = datetime.datetime.now()
42 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svm_experimentName)
43 | print("Time loading svm [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
44 | else:
45 | svmWeights, svmBias, svmFeatScale = (None, None, None)
46 |
47 | # compute ROIs
48 | tstart = datetime.datetime.now()
49 | imgOrig = imread(imgPath)
50 | currRois = computeRois(imgOrig, boAddSelectiveSearchROIs, boAddGridROIs, boFilterROIs, ss_kvals, ss_minSize,
51 | ss_max_merging_iterations, ss_nmsThreshold,
52 | roi_minDimRel, roi_maxDimRel, roi_maxImgDim, roi_maxAspectRatio, roi_minNrPixelsRel,
53 | roi_maxNrPixelsRel, grid_nrScales, grid_aspectRatios, grid_downscaleRatioPerIteration, grid_stepSizeRel)
54 | currRois = currRois[:cntk_nrRois] # only keep first cntk_nrRois rois
55 | print("Time roi computation [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
56 |
57 | # prepare DNN inputs
58 | tstart = datetime.datetime.now()
59 | imgPadded = imresizeAndPad(imgOrig, cntk_padWidth, cntk_padHeight)
60 | _, _, roisCntk = getCntkInputs(imgPath, currRois, None, train_posOverlapThres, nrClasses, cntk_nrRois, cntk_padWidth, cntk_padHeight)
61 | arguments = {
62 | model.arguments[0]: [np.ascontiguousarray(np.array(imgPadded, dtype=np.float32).transpose(2, 0, 1))], # convert from HWC to CNTK's CHW format
63 | model.arguments[1]: [np.array(roisCntk, np.float32)]
64 | }
65 | print("Time cnkt input generation [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
66 |
67 | # run DNN model
68 | print("Running model..")
69 | tstart = datetime.datetime.now()
70 | dnnOutputs = model.eval(arguments)[0]
71 | dnnOutputs = dnnOutputs[:len(currRois)] # remove the zero-padded rois
72 | print("Time running model [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
73 |
74 | # score all ROIs
75 | tstart = datetime.datetime.now()
76 | labels, scores = scoreRois(classifier, dnnOutputs, svmWeights, svmBias, svmFeatScale, len(classes),
77 | decisionThreshold = vis_decisionThresholds[classifier])
78 | print("Time making prediction [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
79 |
80 | # perform non-maxima suppression
81 | tstart = datetime.datetime.now()
82 | nmsKeepIndices = []
83 | if boUseNonMaximaSurpression:
84 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, labels, scores, currRois)
85 | print("Non-maxima surpression kept {:4} of {:4} rois (nmsThreshold={})".format(
86 | len(nmsKeepIndices), len(labels), nmsThreshold))
87 | print("Time non-maxima surpression [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000))
88 |
89 | # visualize results
90 | imgDebug = visualizeResults(imgPath, labels, scores, currRois, classes, nmsKeepIndices,
91 | boDrawNegativeRois=False, boDrawNmsRejectedRois=False)
92 | imshow(imgDebug, waitDuration=0, maxDim=800)
93 |
94 | # create json-encoded string of all detections
95 | outDict = [{"label": str(l), "score": str(s), "nms": str(False), "left": str(r[0]), "top": str(r[1]), "right": str(r[2]), "bottom": str(r[3])} for l, s, r in zip(labels, scores, currRois)]
96 | for i in nmsKeepIndices:
97 | outDict[i]["nms"] = str(True)
98 | outJsonString = json.dumps(outDict)
99 | print("Json-encoded detections: " + outJsonString[:120] + "...")
100 | print("DONE.")
101 |
102 | #--- optional code ---#
103 |
104 | # write all detections to file, and show how to read in again to visualize
105 | # writeDetectionsFile("detections.tsv", outDict, classes)
106 | # labels2, scores2, currRois2, nmsKeepIndices2 = parseDetectionsFile("detections.tsv", lutClass2Id)
107 | # imgDebug2 = visualizeResults(imgPath, labels2, scores2, currRois2, classes, nmsKeepIndices2, # identical to imgDebug
108 | # boDrawNegativeRois=False, boDrawNmsRejectedRois=False)
109 | # imshow(imgDebug2, waitDuration=0, maxDim=800)
110 |
111 | # extract crop of the highest scored ROI
112 | # maxScore = -float("inf")
113 | # maxScoreRoi = []
114 | # for index, (label,score) in enumerate(zip(labels,scores)):
115 | # if score > maxScore and label > 0: #and index in nmsKeepIndices:
116 | # maxScore = score
117 | # maxScoreRoi = currRois[index]
118 | # if maxScoreRoi == []:
119 | # print("WARNING: not a single object detected")
120 | # else:
121 | # imgCrop = imgOrig[maxScoreRoi[1]:maxScoreRoi[3], maxScoreRoi[0]:maxScoreRoi[2], :]
122 | # imwrite(imgCrop, outCropDir + os.path.basename(imgPath))
123 | # imshow(imgCrop)
124 |
125 |
--------------------------------------------------------------------------------
/A1_annotateImages.py:
--------------------------------------------------------------------------------
1 | import os, sys, importlib, shutil
2 | import PARAMETERS
3 | locals().update(importlib.import_module("PARAMETERS").__dict__)
4 |
5 |
6 | ####################################
7 | # Parameters
8 | ####################################
9 | imagesToAnnotateDir = "C:/Users/pabuehle/Desktop/newImgs/"
10 |
11 | #no need to change these params
12 | drawingMaxImgSize = 1000.0
13 | annotationsFile = resultsDir + "annotations.tsv"
14 | minNrPixels = -1
15 |
16 |
17 | ####################################
18 | # Functions
19 | ####################################
20 | def event_cv2GetRectangles(event, x, y, flags, param):
21 | global cv2GetRectangle_global_bboxes
22 | global cv2GetRectangle_global_leftButtonDownPoint
23 | boLeftMouseDown = flags == cv2.EVENT_FLAG_LBUTTON
24 |
25 | #draw all previous bounding boxes
26 | imgCopy = image.copy()
27 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes)
28 | if len(cv2GetRectangle_global_bboxes)>0:
29 | drawRectangles(imgCopy, [cv2GetRectangle_global_bboxes[-1]], color = (255, 0, 0))
30 |
31 | #handle mouse clicks
32 | if event == cv2.EVENT_LBUTTONDOWN:
33 | cv2GetRectangle_global_leftButtonDownPoint = (x, y)
34 | elif event == cv2.EVENT_LBUTTONUP:
35 | pt1 = cv2GetRectangle_global_leftButtonDownPoint
36 | pt2 = (x, y)
37 | minPt = (min(pt1[0], pt2[0]), min(pt1[1], pt2[1]))
38 | maxPt = (max(pt1[0], pt2[0]), max(pt1[1], pt2[1]))
39 | imgWidth, imgHeight = imWidthHeight(image)
40 | minPt = ptClip(minPt, imgWidth, imgHeight)
41 | maxPt = ptClip(maxPt, imgWidth, imgHeight)
42 | cv2GetRectangle_global_bboxes.append(minPt + maxPt)
43 | elif boLeftMouseDown:
44 | cv2.rectangle(imgCopy, cv2GetRectangle_global_leftButtonDownPoint, (x, y), (255, 255, 0), 1)
45 | else:
46 | drawCrossbar(imgCopy, (x, y))
47 | cv2.imshow("image", imgCopy)
48 |
49 |
50 | def procBoundingBoxes(rectsIn, imageUnscaled, scaleFactor):
51 | if len(rectsIn) <= 0:
52 | return rectsIn
53 | else:
54 | rects = copy.deepcopy(rectsIn)
55 | for index in range(len(rects)):
56 | for i in range(4):
57 | rects[index][i] = int(round(rects[index][i] / scaleFactor))
58 | imgWidth, imgHeight = imWidthHeight(imageUnscaled)
59 | bboxes = [Bbox(*rect) for rect in rects]
60 | for bbox in bboxes:
61 | bbox.crop(imgWidth, imgHeight)
62 | assert(bbox.isValid())
63 | return [bbox.rect() for bbox in bboxes]
64 |
65 |
66 |
67 | ####################################
68 | # Main
69 | ####################################
70 | makeDirectory(resultsDir)
71 | imgFilenames = [f for f in os.listdir(imagesToAnnotateDir) if f.lower().endswith(".jpg")]
72 |
73 | print("Using annotations file: " + annotationsFile)
74 | if annotationsFile and os.path.exists(annotationsFile):
75 | shutil.copyfile(annotationsFile, annotationsFile + ".backup.tsv")
76 | data = readTable(annotationsFile)
77 | annotationsLUT = getDictionary(getColumn(data,0), getColumn(data,1), False)
78 | else:
79 | annotationsLUT = dict()
80 |
81 |
82 | #loop over each image and get annotation
83 | for imgFilenameIndex,imgFilename in enumerate(imgFilenames):
84 | print("imgFilenameIndex = {}, imgFilename = {}".format(imgFilenameIndex, imgFilename))
85 | imgPath = imagesToAnnotateDir + imgFilename
86 | print("Processing image {0} of {1}: {2}".format(imgFilenameIndex, len(imgFilenames), imgPath))
87 | bBoxPath = imgPath[:-4] + ".bboxes.tsv"
88 |
89 | #compute scale factor
90 | imgWidth, imgHeight = imWidthHeight(imgPath)
91 | scaleFactor = min(1, drawingMaxImgSize / max(imgWidth, imgHeight))
92 | if imgWidth * imgHeight < minNrPixels:
93 | print("Low resolution ({0},{1}) hence skipping image: {2}.".format(imgWidth, imgHeight, imgPath))
94 | continue
95 |
96 | #load existing ground truth if provided
97 | cv2GetRectangle_global_bboxes = []
98 | if os.path.exists(bBoxPath):
99 | print("Skipping image since ground truth already exists: %s." % imgPath)
100 | continue
101 |
102 | #draw image
103 | imageUnscaled = imread(imgPath)
104 | image = imresize(imageUnscaled, scaleFactor)
105 | cv2.namedWindow("image")
106 | cv2.setMouseCallback("image", event_cv2GetRectangles)
107 | imgCopy = image.copy()
108 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes)
109 | cv2.imshow("image", imgCopy)
110 |
111 | #wait for user input
112 | while True:
113 | key = chr(cv2.waitKey()) #& 0xFF
114 |
115 | #skip
116 | if key == "s":
117 | if os.path.exists(bBoxPath):
118 | print("Skipping image hence deleting existing bbox file: " + bBoxPath)
119 | os.remove(bBoxPath)
120 | annotationsLUT[imgPath] = "skip"
121 | if annotationsFile:
122 | writeTable(annotationsFile, sortDictionary(annotationsLUT))
123 | break
124 |
125 | #undo
126 | if key == "u":
127 | if len(cv2GetRectangle_global_bboxes) >= 1:
128 | cv2GetRectangle_global_bboxes = cv2GetRectangle_global_bboxes[:-1]
129 | imgCopy = image.copy()
130 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes)
131 | cv2.imshow("image", imgCopy)
132 |
133 | #next image
134 | elif key == "n":
135 | bboxes = procBoundingBoxes(cv2GetRectangle_global_bboxes, imageUnscaled, scaleFactor)
136 | writeTable(bBoxPath, bboxes)
137 | annotationsLUT[imgPath] = bboxes
138 | if annotationsFile:
139 | writeTable(annotationsFile, sortDictionary(annotationsLUT))
140 | break
141 |
142 | #quit
143 | elif key == "q":
144 | sys.exit()
145 |
146 | cv2.destroyAllWindows()
147 | print("DONE.")
--------------------------------------------------------------------------------
/A2_annotateBboxLabels.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import cv2, os, sys, time, importlib
3 | from tkinter import *
4 | from PIL import ImageTk
5 | import PARAMETERS
6 | locals().update(importlib.import_module("PARAMETERS").__dict__)
7 |
8 |
9 | ####################################
10 | # Parameters
11 | ####################################
12 | imagesToAnnotateDir = "C:/Users/pabuehle/Desktop/newImgs/"
13 |
14 | #no need to change these
15 | boxWidth = 10
16 | boxHeight = 2
17 | drawingMaxImgSize = 1000
18 | objectNames = classes[1:]
19 | objectNames = np.sort(objectNames).tolist()
20 | objectNames += ["UNDECIDED", "EXCLUDE"]
21 |
22 |
23 |
24 | ####################################
25 | # Helper functions
26 | ####################################
27 | def buttonPressedCallback(s):
28 | global tkLastButtonPressed
29 | global tkBoButtonPressed
30 | tkLastButtonPressed = s
31 | tkBoButtonPressed = True
32 |
33 |
34 |
35 | ####################################
36 | # Main
37 | ####################################
38 | #create UI
39 | tk = Tk()
40 | w = Canvas(tk, width=len(objectNames) * boxWidth, height=len(objectNames) * boxHeight, bd = boxWidth, bg = 'white')
41 | w.grid(row = len(objectNames), column = 0, columnspan = 2)
42 | for objectIndex,objectName in enumerate(objectNames):
43 | b = Button(width=boxWidth, height=boxHeight, text=objectName, command=lambda s = objectName: buttonPressedCallback(s))
44 | b.grid(row = objectIndex, column = 0)
45 |
46 |
47 | #loop over all images
48 | imgFilenames = getFilesInDirectory(imagesToAnnotateDir, ".jpg")
49 | for imgIndex, imgFilename in enumerate(imgFilenames):
50 | print("imgIndex={}, imgFilename={}".format(imgIndex, imgFilename))
51 | labelsPath = imagesToAnnotateDir + "/" + imgFilename[:-4] + ".bboxes.labels.tsv"
52 | if os.path.exists(labelsPath):
53 | continue
54 |
55 | #load image and bboxes
56 | imgPath = imagesToAnnotateDir + "/" + imgFilename
57 | print("imgIndex = {}, imgPath = {}".format(imgIndex, imgPath))
58 | img = imread(imgPath)
59 | rectsPath = imagesToAnnotateDir + "/" + imgFilename[:-4] + ".bboxes.tsv"
60 | rects = readTable(rectsPath)
61 | rects = [ToIntegers(rect) for rect in rects]
62 |
63 | #annotate each rectangle in turn
64 | labels = []
65 | for rectIndex,rect in enumerate(rects):
66 | imgCopy = img.copy()
67 | drawRectangles(imgCopy, [rect], thickness = 15)
68 |
69 | #draw image in tk window
70 | imgTk, _ = imresizeMaxDim(imgCopy, drawingMaxImgSize)
71 | imgTk = imconvertCv2Pil(imgTk)
72 | imgTk = ImageTk.PhotoImage(imgTk)
73 | label = Label(tk, image=imgTk)
74 | label.grid(row=0, column=1, rowspan=drawingMaxImgSize)
75 | tk.update_idletasks()
76 | tk.update()
77 |
78 | #busy-wait until button pressed
79 | tkBoButtonPressed = False
80 | tkLastButtonPressed = None
81 | while not tkBoButtonPressed:
82 | tk.update_idletasks()
83 | tk.update()
84 |
85 | #store result
86 | print("tkLastButtonPressed", tkLastButtonPressed)
87 | labels.append(tkLastButtonPressed)
88 |
89 | writeFile(labelsPath, labels)
90 | tk.destroy()
91 | print("DONE.")
--------------------------------------------------------------------------------
/B1_evaluateRois.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys, os, importlib
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)
5 |
6 |
7 |
8 | ####################################
9 | # Parameters
10 | ####################################
11 | subdirs = ['positive']
12 |
13 |
14 | ####################################
15 | # Main
16 | ####################################
17 | overlaps = []
18 | roiCounts = []
19 | for subdir in subdirs:
20 | imgFilenames = getFilesInDirectory(imgDir + subdir, ".jpg")
21 |
22 | #loop over all images
23 | for imgIndex,imgFilename in enumerate(imgFilenames):
24 | if imgIndex % 50 == 0:
25 | print("Processing subdir '{}', image {} of {}".format(subdir, imgIndex, len(imgFilenames)))
26 | # load ground truth
27 | imgPath = imgDir + subdir + "/" + imgFilename
28 | imgWidth, imgHeight = imWidthHeight(imgPath)
29 | gtRois, gtLabels = readGtAnnotation(imgPath)
30 | gtRois = [Bbox(*roi) for roi in gtRois]
31 |
32 | # load rois and compute scale
33 | rois = readRois(roiDir, subdir, imgFilename)
34 | rois = rois[:cntk_nrRois] # only use the first N rois (similar to rest of code)
35 | rois = [Bbox(*roi) for roi in rois]
36 | roiCounts.append(len(rois))
37 |
38 | # for each ground truth, compute if it is covered by an roi
39 | for gtIndex, (gtLabel, gtRoi) in enumerate(zip(gtLabels,gtRois)):
40 | maxOverlap = -1
41 | assert (gtRoi.max() <= max(imgWidth, imgHeight) and gtRoi.max() >= 0)
42 | if gtLabel in classes[1:]:
43 | for roi in rois:
44 | assert (roi.max() <= max(imgWidth, imgHeight) and roi.max() >= 0)
45 | overlap = bboxComputeOverlapVoc(gtRoi, roi)
46 | maxOverlap = max(maxOverlap, overlap)
47 | overlaps.append(maxOverlap)
48 | print("Average number of rois per image " + str(int(1.0 * sum(roiCounts) / len(imgFilenames))))
49 |
50 | #compute recall at different overlaps
51 | recalls = []
52 | overlaps = np.array(overlaps, np.float32)
53 | for overlapThreshold in np.linspace(0,1,21):
54 | recall = 1.0 * sum(overlaps >= overlapThreshold) / len(overlaps)
55 | recalls.append(recall)
56 | print("At threshold {:.2f}: recall = {:2.2f}".format(overlapThreshold, recall))
57 | print("Mean recall = {:2.2}".format(np.mean(recalls)))
--------------------------------------------------------------------------------
/B2_cntkVisualizeInputs.py:
--------------------------------------------------------------------------------
1 | import os, importlib, sys
2 | import PARAMETERS
3 | locals().update(importlib.import_module("PARAMETERS").__dict__)
4 |
5 |
6 | ####################################
7 | # Parameters
8 | ####################################
9 | image_set = 'test' # 'train', 'test'
10 |
11 | #no need to change these parameters
12 | parseNrImages = 50 #for speed reasons only parse CNTK file for the first N images
13 | boUseNonMaximaSurpression = False
14 |
15 |
16 |
17 | ####################################
18 | # Main
19 | ####################################
20 | print("Load ROI co-ordinates and labels")
21 | cntkImgsPath, cntkRoiCoordsPath, cntkRoiLabelsPath, nrRoisPath = cntkInputPaths(cntkFilesDir, image_set)
22 | imgPaths = getColumn(readTable(cntkImgsPath),1)
23 | nrRealRois = [int(s) for s in readFile(nrRoisPath)]
24 | roiAllLabels = parseCntkRoiLabels(cntkRoiLabelsPath, cntk_nrRois, len(classes), parseNrImages)
25 | if parseNrImages:
26 | imgPaths = imgPaths[:parseNrImages]
27 | nrRealRois = nrRealRois[:parseNrImages]
28 | roiAllLabels = roiAllLabels[:parseNrImages]
29 | roiAllCoords = parseCntkRoiCoords(imgPaths, cntkRoiCoordsPath, cntk_nrRois, cntk_padWidth, cntk_padHeight, parseNrImages)
30 | assert(len(imgPaths) == len(roiAllCoords) == len(roiAllLabels) == len(nrRealRois))
31 |
32 |
33 | #loop over all images and visualize
34 | for imgIndex,imgPath in enumerate(imgPaths):
35 | print("Visualizing image %d at %s..." %(imgIndex,imgPath))
36 | roiCoords = roiAllCoords[imgIndex][:nrRealRois[imgIndex]]
37 | roiLabels = roiAllLabels[imgIndex][:nrRealRois[imgIndex]]
38 |
39 | #perform non-maxima suppression. note that the classes detected in the image are not affected by this.
40 | nmsKeepIndices = []
41 | if boUseNonMaximaSurpression:
42 | imgWidth, imgHeight = imWidthHeight(imgPath)
43 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, roiLabels, [0] * len(roiLabels), roiCoords)
44 | print("Non-maxima surpression kept {} of {} rois (nmsThreshold={})".format(len(nmsKeepIndices), len(roiLabels), nmsThreshold))
45 |
46 | #visualize results
47 | imgDebug = visualizeResults(imgPath, roiLabels, None, roiCoords, classes, nmsKeepIndices, boDrawNegativeRois=False)
48 | imshow(imgDebug, waitDuration=0, maxDim = 800)
49 | print("DONE.")
50 |
--------------------------------------------------------------------------------
/B3_cntkAnalyzeInputs.py:
--------------------------------------------------------------------------------
1 | import os, sys, importlib
2 | import shutil, time
3 | import PARAMETERS
4 | locals().update(importlib.import_module("PARAMETERS").__dict__)
5 |
6 |
7 | ####################################
8 | # Parameters
9 | ####################################
10 | image_set = "train"
11 |
12 |
13 | ####################################
14 | # Main
15 | ####################################
16 | # read ground truth and ROIs
17 | if not os.path.exists(cntkFilesDir + image_set + ".cache_gt_roidb.pkl"):
18 | raise Exception("Run 2_cntkGenerateInputs.py before executing this script.")
19 | imdb = imdbs[image_set]
20 | gtRois = imdb.gt_roidb()
21 | print("Number of images in set '{}' = {}".format(image_set, imdb.num_images))
22 |
23 | # extract width, height, etc for all ground truth annotations in all images
24 | roiInfos = []
25 | for imgIndex in range(0, imdb.num_images):
26 | imgPath = imdb.image_path_at(imgIndex)
27 | imgWidth, imgHeight = imWidthHeight(imgPath)
28 |
29 | if gtRois[imgIndex] is not None:
30 | for gtRoi in gtRois[imgIndex]['boxes']:
31 | roiWidth = gtRoi[2] - gtRoi[0] + 1
32 | roiHeight = gtRoi[3] - gtRoi[1] + 1
33 | roiRelWidth = float(roiWidth) / imgWidth
34 | roiRelHeight = float(roiHeight) / imgHeight
35 | roiInfos.append((roiRelWidth, roiRelHeight, roiRelWidth * roiRelHeight, roiRelWidth / roiRelHeight))
36 |
37 | # analyse typical width, height, etc of the ground truth annotations
38 | print("\nStatistics for ground truth annotations:")
39 | for percentile in np.linspace(0, 100, 21):
40 | print(" Percentile {:3.0f}: width = {:<.2f}, height = {:<.2f}, area = {:<.3f}, aspectRatio = {:<.2f}".format(
41 | percentile,
42 | np.percentile(getColumn(roiInfos, 0), percentile),
43 | np.percentile(getColumn(roiInfos, 1), percentile),
44 | np.percentile(getColumn(roiInfos, 2), percentile),
45 | np.percentile(getColumn(roiInfos, 3), percentile)))
46 | print("DONE.")
47 |
--------------------------------------------------------------------------------
/PARAMETERS.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from helpers import *
3 | from imdb_data import imdb_data
4 | import fastRCNN, time, datetime
5 | from fastRCNN.pascal_voc import pascal_voc
6 | print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
7 |
8 |
9 | ############################
10 | # Adjust these parameters
11 | # to make scripts run
12 | ############################
13 | rootDir = os.path.dirname(os.path.realpath(sys.argv[0]))
14 |
15 | ############################
16 | # default parameters
17 | ############################
18 | datasetName = "grocery"
19 |
20 | #directories
21 | imgDir = rootDir + "/data/" + datasetName + "/"
22 | procDir = rootDir + "/proc/" + datasetName + "/"
23 | resultsDir = rootDir + "/results/" + datasetName + "/"
24 | roiDir = procDir + "rois/"
25 | modelDir = procDir + "models/"
26 | cntkFilesDir = procDir + "cntkFiles/"
27 | trainedSvmDir = procDir + "trainedSvm/"
28 | cntkResourcesDir = rootDir + "/resources/cntk/"
29 |
30 | # ROI generation
31 | roi_maxImgDim = 200 # image size used for ROI generation
32 | roi_minDimRel = 0.01 # minimum relative width/height of a ROI
33 | roi_maxDimRel = 1.0 # maximum relative width/height of a ROI
34 | roi_minNrPixelsRel = 0 # minimum relative area covered by a ROI
35 | roi_maxNrPixelsRel = 1.0 # maximum relative area covered by a ROI
36 | roi_maxAspectRatio = 4.0 # maximum aspect ratio of a ROI, both vertically and horizontally
37 | ss_minSize = 20 # for a description of the selective search parameters see:
38 | ss_kvals = (50, 500, 6) # http://dlib.net/dlib/image_transforms/segment_image_abstract.h.html#find_candidate_object_locations
39 | ss_max_merging_iterations = 20 #
40 | ss_nmsThreshold = 0.85 # non-maxima suppression threshold run after selective search
41 | grid_nrScales = 7 # uniform grid ROIs: number of iterations from largest possible ROI to smaller ROIs
42 | grid_stepSizeRel = 0.5 # uniform grid ROIs: step size for sliding windows
43 | grid_aspectRatios = [1.0, 2.0, 0.5] # uniform grid ROIs: allowed aspect ratio of ROIs
44 | grid_downscaleRatioPerIteration = 1.5 # uniform grid ROIs: relative ROI width/height reduction per iteration, starting from largest possible ROI
45 |
46 | # cntk model
47 | cntk_nrRois = 2000 # DNN input number of ROIs per image. Zero-padded/truncated if necessary
48 | cntk_padWidth = 1000 # DNN input image width [pixels]
49 | cntk_padHeight = 1000 # DNN input image height [pixels]
50 | cntk_featureDimensions = {'svm': 4096} # DNN output, dimension of each ROI
51 |
52 | # nn and svm training
53 | classifier = 'svm' # Options: 'svm', 'nn'. Train either a Support Vector Machine, or directly the Neural Network
54 | train_posOverlapThres = 0.5 # DNN and SVM threshold for marking ROIs with significant overlap with a GT object as positive
55 |
56 | # nn training
57 | cntk_max_epochs = 18 # number of training epochs (only relevant if 'classifier' is set to 'nn')
58 | cntk_mb_size = 5 # minibatch size
59 | cntk_l2_reg_weight = 0.0005 # l2 regularizer weight
60 | cntk_lr_per_image = [0.01] * 10 + [0.001] * 5 + [0.0001] #learning rate per image
61 | cntk_momentum_time_constant = 10 # momentum
62 |
63 | # svm training
64 | svm_C = 0.001 # regularization parameter of the soft-margin error term
65 | svm_B = 10.0 # intercept scaling
66 | svm_nrEpochs = 2 # number of training iterations
67 | svm_retrainLimit = 2000 # number of new items to trigger SVM training
68 | svm_evictThreshold = -1.1 # remove easy negatives with decision value below this threshold
69 | svm_posWeight = "balanced" # automatically balance training set to correct for the majority of ROIs being negative
70 | svm_targetNorm = 20.0 # magic value from traditional R-CNN (helps with convergence)
71 | svm_penality = 'l2' # penalty norm
72 | svm_loss = 'l1' # loss norm
73 | svm_rngSeed = 3 # seed for randomization
74 |
75 | # postprocessing
76 | nmsThreshold = 0.3 # Non-Maxima suppression threshold (in range [0,1])
77 | # The lower the more ROIs will be combined. Used during evaluation and visualization (scripts 5_)
78 | vis_decisionThresholds = {'svm' : 0.5, # Reject detections with low confidence, used only in 5_visualizeResults
79 | 'nn' : None}
80 |
81 | # evaluation
82 | evalVocOverlapThreshold = 0.5 # voc-style intersection-over-union threshold used to determine if object was found
83 |
84 |
85 |
86 | ############################
87 | # project-specific
88 | # parameters / overrides
89 | ############################
90 | if datasetName.startswith("grocery"):
91 | classes = ('__background__', # always have '__background__' be at index 0
92 | "orange", "eggBox", "joghurt", "ketchup", "squash", "mushroom", "water", "mustard")
93 |
94 |
95 | # roi generation
96 | cntk_nrRois = 200 #this number is too low to get good accuracy but allows for fast training and scoring (for demo purposes)
97 | roi_minDimRel = 0.04
98 | roi_maxDimRel = 0.4
99 | roi_minNrPixelsRel = 2 * roi_minDimRel * roi_minDimRel
100 | roi_maxNrPixelsRel = 0.33 * roi_maxDimRel * roi_maxDimRel
101 |
102 | # postprocessing
103 | nmsThreshold = 0.01
104 |
105 | # database
106 | imdbs = dict() # database provider of images and image annotations
107 | for image_set in ["train", "test"]:
108 | imdbs[image_set] = imdb_data(image_set, classes, cntk_nrRois, imgDir, roiDir, cntkFilesDir, boAddGroundTruthRois = (image_set!='test'))
109 |
110 |
111 | elif datasetName.startswith("pascalVoc"):
112 | classes = ('__background__',
113 | 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
114 | 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
115 | lutImageSet = {"train": "trainval", "test": "test"}
116 |
117 | # model training / scoring
118 | classifier = 'nn'
119 |
120 | # cntk model (Should train a model with mean-AP around 0.45)
121 | # more than 99% of the test images have less than 4000 rois, but 50% more than 2000
122 | cntk_mb_size = 2
123 | cntk_nrRois = 4000
124 | cntk_lr_per_image = [0.05] * 10 + [0.005] * 5 + [0.0005]
125 |
126 | # database
127 | imdbs = dict()
128 | for image_set, year in zip(["train", "test"], ["2007", "2007"]):
129 | imdbs[image_set] = fastRCNN.pascal_voc(lutImageSet[image_set], year, classes, cntk_nrRois, cacheDir = cntkFilesDir)
130 | print("Number of {} images: {}".format(image_set, imdbs[image_set].num_images))
131 |
132 | else:
133 | raise ValueError("Unknown datasetName: " + datasetName)
134 |
135 |
136 |
137 | ############################
138 | # computed parameters
139 | ############################
140 | nrClasses = len(classes)
141 | cntk_featureDimensions['nn'] = nrClasses
142 | lutClass2Id = dict(zip(classes, range(len(classes))))
143 |
144 | print("PARAMETERS: datasetName = " + datasetName)
145 | assert cntk_padWidth == cntk_padHeight, "ERROR: different width and height for padding not supported."
146 | assert classifier.lower() in ['svm','nn'], "ERROR: only 'nn' or 'svm' classifier supported."
147 | assert not (datasetName == 'pascalVoc' and classifier == 'svm'), "ERROR: 'svm' classifier for pascal VOC not supported."
148 | assert(train_posOverlapThres >= 0 and train_posOverlapThres <= 1)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | Fast R-CNN Object Detection Tutorial for Microsoft Cognitive Toolkit (CNTK)
3 | ==============
4 |
5 | ```diff
6 | + Update V2.0.1 (June 2017):
7 | + Updated documentation to include Visual Object Tagging Tool as an annotation option.
8 | + Update v2 (June 2017):
9 | + Updated code to be compatible with the CNTK 2.0.0 release.
10 | + Update v1 (Feb 2017):
11 | + This tutorial was updated to use CNTK's python wrappers. Now all processing happens in-memory during scoring. See script 6_scoreImage.py for an example. Furthermore, we switched to a much more accurate and faster implementation of Selective Search.
12 | + Note that, at the time of writing, CNTK does not support Python 2. If you need Python 2 then please refer to the [previous version](https://github.com/Azure/ObjectDetectionUsingCntk/tree/7edd3276a189bad862dc54e9f73b7cfcec5ae562) of this tutorial.
13 | ```
14 |
15 | DESCRIPTION
16 | --------------
17 |
18 | Object Detection is one of the main problems in Computer Vision. Traditionally, this required expert knowledge to identify and implement so-called “features” that highlight the position of objects in the image. Starting in 2012 with the famous AlexNet paper, Deep Neural Networks have been used to find these features automatically. This led to a huge improvement in the field for a large range of problems.
19 |
20 | This tutorial uses Microsoft Cognitive Toolkit's (CNTK) fast R-CNN implementation (see the [Fast R-CNN](#fast-r-cnn) section for a description) which was shown to produce state-of-the-art results for [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/), one of the main object detection challenges in the field.
21 |
22 | GOALS
23 | --------------
24 |
25 | The goal of this tutorial is to show how to train and test your own Deep Learning object detection model using [Microsoft Cognitive Toolkit (CNTK)](https://github.com/Microsoft/CNTK). Example data and annotations are provided, but the reader can also bring their own images and train their own, unique, object detector.
26 |
27 | The tutorial is split into four parts:
28 | - [Part 1](#part-1) shows how to train an object detection model for the example data without retraining the provided Neural Network, but instead training an external classifier on its output. This approach works particularly well with small datasets, and does not require expertise with deep learning.
29 | - [Part 2](#part-2) extends this approach to refine the Neural Network directly without the need for an external classifier.
30 | - [Part 3](#part-3) illustrates how to annotate your own images and use these to train an object detection model for your specific use case.
31 | - [Part 4](#part-4) covers how to reproduce published results on the Pascal VOC dataset.
32 |
33 | Previous expertise with Machine Learning is not required to complete this tutorial, but it is very helpful for understanding the underlying principles. More information on the topic can also be found at [CNTK's Fast-RCNN page](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Detection/FastRCNN).
34 |
35 |
36 |
37 |
38 | PREREQUISITES
39 | --------------
40 |
41 | This tutorial was tested using CNTK v2.0.0, and assumes that CNTK was installed with the (default) Anaconda Python interpreter. Note that the code will only run with CNTK v2.0 due to breaking changes in other versions.
42 |
43 | CNTK can be easily installed by following the instructions on the [script-driven installation page](https://github.com/Microsoft/CNTK/wiki/Setup-Windows-Binary-Script). This will also automatically add an Anaconda Python distribution. At the time of writing, the default python version is 3.5.
44 |
45 | A dedicated GPU is not required, but recommended for retraining of the Neural Network (part 2). If you lack a strong GPU, don't want to install CNTK yourself, or want to train a model using multiple GPUs, then consider using Azure's Data Science Virtual Machine. See the [Cortana Intelligence Gallery](https://gallery.cortanaintelligence.com/Solution/Linux-Data-Science-Virtual-Machine-3) for a 1-click deployment solution.
46 |
47 |
56 |
57 | Several Python packages are required to execute the python scripts. These libraries can be installed easily using the provided python wheels by opening a command prompt and running:
58 | ````bash
59 | c:/local/CNTK-2-0/cntk/Scripts/cntkpy35.bat
60 | cd resources/python35_64bit_requirements/
61 | pip.exe install -r requirements.txt
62 | ````
63 |
64 | In the code snippet above, we assumed that the CNTK root directory is C:/local/CNTK-2-0/. The python wheels were originally downloaded from this [page](http://www.lfd.uci.edu/~gohlke/pythonlibs/).
65 |
66 | Finally, the file *AlexNet.model* is too big to be hosted on Github and hence needs to be downloaded manually from [here](https://www.cntk.ai/Models/AlexNet/AlexNet.model) and saved as */resources/cntk/AlexNet.model*.
67 |
68 |
69 |
70 | FOLDER STRUCTURE
71 | --------------
72 |
73 | |Folder| Description
74 | |---|---
75 | |/| Root directory
76 | |/data/| Directory containing images for different object recognition projects
77 | |/data/grocery/| Example data for grocery item detection in refrigerators
78 | |/data/grocery/positive/| Images and annotations to train the model
79 | |/data/grocery/negative/| Images used as negatives during model training
80 | |/data/grocery/testImages/| Test images used to evaluate model accuracy
81 | |/doc/| Resources such as images for this readme page
82 | |/fastRCNN/| Slightly modified code used in R-CNN publications
83 | |/resources/| All provided resources are in here
84 | |/resources/cntk/| CNTK configuration file and pre-trained AlexNet model
85 | |/resources/python35_64bit_requirements/| Python wheels and requirements file for 64bit Python version 3.5
86 |
87 |
88 | All scripts used in this tutorial are located in the root folder.
89 |
90 |
91 | PART 1
92 | --------------
93 | In the first part of this tutorial we will train a classifier which uses, but does not modify, a pre-trained deep neural network. See the [Fast R-CNN](#fast-r-cnn) section for details of the employed approach. As example data, 25 images of grocery items inside refrigerators are provided, split into 20 images for training and 5 for testing. The training images contain in total 180 annotated objects from these classes:
94 | ```
95 | Egg box, joghurt, ketchup, mushroom, mustard, orange, squash, and water.
96 | ```
97 | Note that 20 training images is a very low number and too little to train a high-accuracy detector. Nevertheless, even this small dataset is sufficient to return plausible detections, as can be seen in step 5.
98 | Every step has to be executed in order, and we recommend inspecting after each step which files are written, where they are written to, and what their content is (most files are written as plain text).
99 |
100 |
101 |
102 |
103 | ### STEP 1: Computing Regions of Interest
104 | `Script: 1_computeRois.py`
105 |
106 | Regions-of-interest (ROIs) are computed for each image independently using a 3-step approach: First, Selective Search is used to generate hundreds of ROIs per image. These ROIs often fit tightly around some objects but miss other objects in the image (see the [Selective Search](#selective-search) section). Many of the ROIs are bigger, smaller, etc. than the typical grocery item in our dataset. Hence, in a second step, these ROIs, as well as ROIs which are too similar to each other, are discarded. Finally, to complement the ROIs detected by Selective Search, ROIs that uniformly cover the image are added at different scales and aspect ratios.
107 |
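The uniform-grid step can be illustrated with a short sketch. The `gridRois` helper below is hypothetical, not the tutorial's actual implementation (which lives in *helpers.py* and is driven by the `grid_*` parameters in `PARAMETERS.py`); it merely shows the idea of placing ROIs on a regular grid at several scales and aspect ratios:

```python
import itertools

def gridRois(imgWidth, imgHeight, nrScales=7, aspectRatios=(1.0, 2.0, 0.5),
             downscaleRatioPerIteration=1.5, stepSizeRel=0.5):
    # Hypothetical sketch: place (x1, y1, x2, y2) ROIs on a uniform grid,
    # starting from the largest ROI that fits and shrinking it at each scale.
    rois = []
    for scaleIter, aspectRatio in itertools.product(range(nrScales), aspectRatios):
        roiHeight = imgHeight / (downscaleRatioPerIteration ** scaleIter)
        roiWidth = roiHeight * aspectRatio
        if roiWidth > imgWidth or roiHeight > imgHeight:
            continue  # this scale / aspect-ratio combination does not fit the image
        stepX, stepY = roiWidth * stepSizeRel, roiHeight * stepSizeRel
        x = 0.0
        while x + roiWidth <= imgWidth:
            y = 0.0
            while y + roiHeight <= imgHeight:
                rois.append((int(x), int(y), int(x + roiWidth), int(y + roiHeight)))
                y += stepY
            x += stepX
    return rois

print(len(gridRois(1000, 600)))  # e.g. number of grid ROIs for a 1000x600 image
```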
108 | The final ROIs are written for each image separately to the files *[imageName].roi.txt* in the *proc/grocery/rois/* folder.
109 |
110 | For the grocery dataset, Selective Search typically generates around 1000 ROIs per image, plus on average another 2000 ROIs sampled uniformly from the image. A high number of ROIs typically leads to better object detection performance, at the expense, however, of a longer running time. Hence the parameter `cntk_nrRois` can be used to keep only a subset of the ROIs (e.g. if `cntk_nrRois = 2000` then typically all ROIs from Selective Search are preserved, plus the 1000 largest ROIs generated using uniform sampling).
111 |
112 | The quality of these ROIs can be measured by counting how many of the ground truth annotated objects in an image are covered by at least one ROI, where "covered" is defined as having an overlap greater than a given threshold. The script `B1_evaluateRois.py` outputs these counts at different threshold values: for example, at a threshold of 0.5 and with 2000 ROIs the recall is around 98%, while with 200 ROIs it is around 85%. It is important that the recall at a threshold of 0.5 is close to 100%, since even a perfect classifier cannot find an object in the image if it is not covered by at least one ROI.
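The overlap measure used here is the standard intersection-over-union (IoU). A minimal sketch of such a recall computation (an illustration of the idea, not the code of `B1_evaluateRois.py`):

```python
# Minimal sketch: fraction of ground truth boxes covered by at least
# one ROI with intersection-over-union (IoU) above a threshold.
# All boxes are (x1, y1, x2, y2) tuples.
def iou(boxA, boxB):
    ix = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    iy = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    inter = ix * iy
    areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    return inter / float(areaA + areaB - inter)

def roiRecall(gtBoxes, rois, threshold=0.5):
    covered = sum(1 for gt in gtBoxes
                  if any(iou(gt, roi) >= threshold for roi in rois))
    return covered / float(len(gtBoxes))
```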
113 |
114 | ROIs computed using Selective Search (left); ROIs from the image above after discarding ROIs that are too small, too big, etc. (middle); Final set of ROIs after adding ROIs that uniformly cover the image (right).
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 | ### STEP 2: Computing CNTK inputs
123 | `Script: 2_cntkGenerateInputs.py`
124 |
125 | Each ROI generated in the last step has to be run through the CNTK model to compute its 4096-float Deep Neural Network representation (see the [Fast R-CNN](#fast-r-cnn) section). This requires three CNTK-specific input files to be generated for the training and the test set:
126 | - *{train,test}.txt*: each row contains the path to an image.
127 | - *{train,test}.rois.txt*: each row contains all ROIs for an image in relative (x,y,w,h) co-ordinates.
128 | - *{train,test}.roilabels.txt*: each row contains the labels for the ROIs in one-hot-encoding.
129 |
130 | An in-depth understanding of how these files are structured is not necessary for this tutorial. However, two points are worth noting:
131 | - CNTK's Fast R-CNN implementation requires all images to be of the same size. For this reason, all images are first scaled, then centered and zero-padded (i.e. columns of gray-colored pixels are added to the left and right of the image, or rows at the top and bottom, respectively). Note that the scaling preserves the original aspect ratio. For our experiments we use a Neural Network input width and height of 1000 x 1000 pixels (see the sketch after this list).
132 | Interestingly, upscaling an image can significantly improve accuracy if the objects to be detected are small (this is because objects in ImageNet typically have a width and height of 100-200 pixels).
133 | - CNTK expects each image to have the same number of ROIs (for our experiments we use 2000). Hence, if the computation in step 1 returned more ROIs, only the first 2000 are used. Likewise, if fewer ROIs were found, the remaining spots are filled with ROIs with co-ordinates of (0,0,0,0). These "zero-padded" ROIs are only used during CNTK execution and have no influence on the training / test performance.
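A minimal sketch of the scale-and-pad operation from the first point, using OpenCV (the function name and the gray value used for padding are illustrative; the actual implementation is part of the helper scripts):

```python
import cv2
import numpy as np

# Minimal sketch: scale a color image to fit a square target while keeping
# its aspect ratio, then center it on a gray canvas. Values are illustrative.
def scaleAndPad(img, targetSize=1000, padValue=114):
    h, w = img.shape[:2]
    scale = targetSize / float(max(h, w))
    resized = cv2.resize(img, (int(w * scale), int(h * scale)))
    canvas = np.full((targetSize, targetSize, 3), padValue, dtype=np.uint8)
    yOff = (targetSize - resized.shape[0]) // 2
    xOff = (targetSize - resized.shape[1]) // 2
    canvas[yOff:yOff + resized.shape[0], xOff:xOff + resized.shape[1]] = resized
    return canvas, scale, (xOff, yOff)  # offsets are needed to map ROIs back
```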
134 |
135 | This step writes the above-mentioned files to the directory *proc/grocery/cntkFiles/*. For debugging, the script `B2_cntkVisualizeInputs.py` can be used to visualize the content of these files (e.g. the figure at the end of step 4 was generated using this script).
136 |
137 |
138 |
139 | ### STEP 3: Running CNTK
140 | `Script: 3_runCntk.py`
141 |
142 | We can now run the CNTK training, which takes as input the co-ordinates and labels files from the last step and writes the 4096-float embedding for each ROI of each image to *proc/grocery/cntkFiles/{train,test}_svm_parsed/[imageName].dat.npz*. This will take a few minutes and will automatically run on a GPU if one is detected.
143 |
144 | Note: Look for the line "Using GPU for training." in the console output to make sure the training runs on the GPU and not on the CPU (which would be too slow). Also note that a previous CNTK run might still be open and holding a lock on the GPU.
145 |
146 |
147 | ### STEP 4: Classifier training
148 | `Script: 4_trainSvm.py`
149 |
150 | We now train the classifier which, given an ROI as input, assigns it to one of the grocery items or to a "background" class.
151 |
152 | We use a slightly modified version of the published R-CNN code to train a linear SVM classifier. The main change is to load the 4096-float ROI embedding from disk rather than running the network on-the-fly. An in-depth explanation of the training procedure can be found in the [R-CNN paper](http://arxiv.org/abs/1311.2524). For the purpose of this tutorial we consider the training script a black box which takes the training ROIs (or, to be precise, their 4096-float representations) as input and outputs N+1 linear classifiers: one for each class, plus one for the background.
153 |
154 | The training starts by loading all positive ROIs into memory, where "positive" means any ROI that has a significant overlap with a ground truth annotated object. Negatives are then iteratively added using hard-negative mining, and the SVM is retrained. A list and short description of the parameters that govern the SVM training can be found in the script `PARAMETERS.py`.
155 |
156 | The learned linear classifier for each class, i.e. a weight vector of dimension 4096 plus a float that represents the bias term, is then written to the folder *proc/grocery/trainedSVMs/*.
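Applying these classifiers at test time then amounts to a single matrix product plus the bias terms. A minimal sketch, assuming the weights and biases have already been loaded into NumPy arrays (all variable names are illustrative):

```python
import numpy as np

# Minimal sketch: score all ROIs of one image with the learned SVMs.
# roiEmbeddings: (numRois, 4096) array from step 3
# weights:       (numClasses + 1, 4096) learned SVM weight vectors
# biases:        (numClasses + 1,) learned bias terms
def scoreRois(roiEmbeddings, weights, biases):
    scores = roiEmbeddings.dot(weights.T) + biases  # (numRois, numClasses + 1)
    labels = scores.argmax(axis=1)                  # class with highest confidence
    confidences = scores.max(axis=1)
    return labels, confidences
```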
157 |
158 |
159 |
160 | ### STEP 5: Evaluation and visualization
161 | `Scripts: 5_evaluateResults.py and 5_visualizeResults.py`
162 |
163 | Once training has succeeded, the model can be used to find objects in images. For this, every ROI in an image is classified and assigned a confidence for each class (orange, ketchup, ..., background). The class with the highest confidence is then selected (most often "background"), and optionally a threshold is applied to reject detections with low confidence.
164 |
165 | The accuracy of the classifier can be measured using the script `5_evaluateResults.py`. This outputs the mean Average Precision (mAP; see the [Mean Average Precision](#mean-average-precision) section) for either the training or the test set. Keep in mind that the test set contains only 5 images, and hence these numbers need to be taken with a grain of salt. Due to randomization effects, one might get very different results when re-running the script.
166 |
167 |
168 |
169 | Results using 200 ROIs (this number is too low to get good accuracy but for demo purposes allows for fast training and scoring):
170 |
171 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP
172 | |---|---|---|---|---|---|---
173 | |Test Set| 0.45 |1.00 |0.82 |0.76 | |**0.63**
174 |
175 | Results using 2000 ROIs:
176 |
177 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP
178 | |---|---|---|---|---|---|---
179 | |Test Set| 0.32 | 0.48 | 0.82 | 0.82 | |**0.65**
180 |
181 | The output of the classifier using 2000 ROIs can be visualized using the script `5_visualizeResults.py`. Only ROIs classified as a grocery item are shown (not background), and only if the confidence of the detection is at or above 0.5. Multiple ROIs are combined into single detections using [Non-Maxima Suppression](#non-maxima-suppression), the output of which is visualized below for the test images.
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 | In addition to visualizing the detected objects, the script `5_visualizeResults.py` also computes precision and recall after rejecting detections with confidence scores below a given threshold. This information can be used to set an operating point for the final classifier: for example, given the table below, to reach 85% precision all detections with a score below 5.0 would have to be rejected.
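A minimal sketch of such a precision / recall sweep (the detection format and the threshold values are illustrative):

```python
# Minimal sketch: precision and recall at different score thresholds.
# detections: list of (score, isTruePositive) tuples for one class;
# numGt: number of ground truth objects of that class.
def precisionRecallSweep(detections, numGt, thresholds=(0.0, 2.5, 5.0, 7.5)):
    for t in thresholds:
        kept = [isTp for score, isTp in detections if score >= t]
        tp = sum(kept)
        precision = tp / float(len(kept)) if kept else 1.0
        recall = tp / float(numGt)
        print("threshold %.1f: precision = %.2f, recall = %.2f" % (t, precision, recall))
```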
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | ### STEP 6: Scoring images
200 | `Script: 6_scoreImage.py`
201 |
202 | Up to now our focus was on training a model and evaluating its performance. Hence all steps were performed one-by-one, and intermediate results were written to and loaded from disk. During scoring, given one or more images, it is preferable to perform all steps in-memory. This is exactly what the script `6_scoreImage.py` does: it loads a given image, computes the ROIs, runs each ROI through the DNN, evaluates the trained SVM if needed, and finally outputs a list of the detected objects.
203 |
204 | Note that the script makes calls to functions in `helpers_cntk.py` which were originally written for steps 1-5. Loading the model takes a few seconds, but this has to be done only once; the model can then be kept in-memory (e.g. in a web service which waits for images to be uploaded).
205 |
206 |
207 |
208 | PART 2
209 | --------------
210 | In part 1 we learned how to classify ROIs by training a linear Support Vector Machine on the output of a given Neural Network. We will now show how to instead perform this classification directly in the Deep Neural Network. This can be achieved by adding a new last layer which, given the input from the last fully connected layer, outputs the probabilities for each ROI to be of a certain class. See section [SVM vs NN training](#svm-vs-nn-training) for pros/cons of the two different approaches.
211 |
212 | Training the Neural Network instead of an SVM is done by simply changing the variable `classifier` in `PARAMETERS.py` from "svm" to "nn". Then, as described in part 1, all the scripts need to be executed in order, except for the SVM training in step 4. This will add a classification layer to the network, train the last layer(s) of the network, and for each ROI write its classification label and confidence to disk (rather than the 4096-float representation which was required to train the SVM). Note that NN training can cause an out-of-memory error on less powerful machines, which can possibly be avoided by reducing the minibatch size and, if needed, also the number of ROIs per image (see the variables `cntk_mb_size` and `cntk_nrRois` in `PARAMETERS.py`).
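That is, the only change needed in `PARAMETERS.py` to switch to NN training is:
```python
classifier = "nn"
```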
213 |
214 | The mean Average Precision after running all steps should look roughly like the results below.
215 |
216 | Using 200 ROIs:
217 |
218 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP
219 | |---|---|---|---|---|---|---
220 | |Test Set| 0.45 |0.97 |0.82 |1.00 | |**0.70**
221 |
222 | Using 2000 ROIs:
223 |
224 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP
225 | |---|---|---|---|---|---|---
226 | |Test Set| 1.00 |0.92 |1.00 |0.07 | |**0.87**
227 |
228 |
245 |
246 |
263 |
264 | The output of the Neural Network with 2000 ROIs on the five test images after Non-Maxima Suppression to combine multiple detections should look like this:
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 | PART 3
274 | --------------
275 | So far we trained and evaluated object detectors using the provided grocery dataset. It is very straightforward to use a custom dataset instead: the necessary scripts for image annotation are included in the repository, and only minor code changes are required to point to a new dataset.
276 |
277 | First, let's have a look at the folder structure and the provided annotation files for the grocery data:
278 | Note how all positive, negative and test images and their annotations are located in the subfolders *positive*, *negative* and *testImages* of *data/grocery/*. Each image (with the exception of the negative images) has (i) a similarly named *[imageName].bboxes.tsv* file, where each row corresponds to the co-ordinates of a manually labeled object (aka a bounding box), and (ii) a *[imageName].bboxes.labels.tsv* file, where each row corresponds to the class of the object (e.g. mushroom or orange).
279 |
280 |
281 | ### Image Annotation
282 |
283 | **Option #1: Visual Object Tagging Tool (Recommended)**
284 |
285 | The [Visual Object Tagging Tool (VOTT)](https://github.com/CatalystCode/VOTT) is a cross-platform annotation tool for tagging video and image assets.
286 |
287 | 
288 |
289 | VOTT provides the following **features**:
290 |
291 | - Computer-assisted tagging and tracking of objects in videos using the [Camshift tracking algorithm](http://opencv.jp/opencv-1.0.0_org/docs/papers/camshift.pdf).
292 | - Exporting tags and assets to CNTK Fast-RCNN format for training an object detection model.
293 | - Running and validating a trained CNTK object detection model on new videos to generate stronger models.
294 |
295 | How to annotate with VOTT:
296 |
297 | 1. Download the latest [Release](https://github.com/CatalystCode/VOTT/releases)
298 | 2. Follow the [Readme](https://github.com/CatalystCode/VOTT/blob/master/README.md) to run a tagging job
299 | 3. After tagging, export the tags to the dataset directory
300 |
301 |
302 | **Option #2: Using Annotation Scripts**
303 |
304 | These two *.tsv* files per image can be generated using the scripts `A1_annotateImages.py` and `A2_annotateBboxLabels.py`.
305 |
306 | The first script lets the user draw rectangles around each object (see the left image below). Once all objects in an image are annotated, pressing the key 'n' writes the *.bboxes.tsv* file and proceeds to the next image, 'u' undoes (i.e. removes) the last rectangle, and 'q' quits the annotation tool.
307 |
308 | The second script loads these manually annotated rectangles for each image, displays them one-by-one, and asks the user to provide the object class by clicking on the respective button to the left of the window (see right image below). Ground truth annotations marked as either "undecided" or "exclude" are fully excluded from further processing.
309 |
310 |
311 |
312 |
313 |
314 | ### Using a custom dataset
315 |
316 | If you used VOTT to generate and export your dataset, it will already be sorted into the *positive*, *negative* and *testImages* subfolders.
317 |
318 | Otherwise, once all (non-negative) images are annotated using the annotation scripts, the images and their *.tsv* annotation files should be copied to the *positive*, *negative* and *testImages* subfolders of a new directory *data/myOwnImages/*, where the string "myOwnImages" can be replaced at will.
319 |
320 | The only required code change is to update the `datasetName` variable in `PARAMETERS.py` to the newly created folder:
321 | ```python
322 | datasetName = "myOwnImages"
323 | ```
324 |
325 | All steps in part 1 can then be executed in order and will use the new dataset.
326 |
327 |
328 | ### How to get good results
329 |
330 | As is true for most Machine Learning projects, getting good results requires careful parameter tuning. To help with this, all important parameters are specified, each with a short explanation, in a single place: the `PARAMETERS.py` file.
331 |
332 | Here are a few tips on how to find good parameters and design a good training set:
333 | - Select images carefully and annotate them consistently. Typically, all objects in an image need to be annotated, even if the image contains many of them; hence it is common practice to remove such cluttered images altogether. The same holds for images where one is uncertain about the label of an object, or where it is unclear whether the object should be annotated at all (e.g. due to truncation, occlusion, motion blur, etc.).
334 | - During Region-of-Interest generation in step 1, all ROIs which are deemed too small, too big, etc. are discarded. This filtering step relies on thresholds on the respective properties, which are defined in `PARAMETERS.py` (paragraph "ROI generation").
335 | Visualizing the generated ROIs helps tremendously with debugging and can be done either while computing the ROIs in the script `1_computeRois.py` itself, or by visualizing the CNTK training files using the script `B2_cntkVisualizeInputs.py`. In addition, the script `B1_evaluateRois.py` computes the percentage of annotated ground truth objects that are covered by one or more ROIs (i.e. the recall). Generally, the more ROIs (variable `cntk_nrRois`), the better the accuracy, at the cost of slower training and scoring.
336 | - Training a linear SVM (step 4) is relatively robust and hence for most problems the corresponding parameters in `PARAMETERS.py` (paragraph "svm training") do not need to be modified.
337 | The evaluation script `5_evaluateResults.py` can be used to verify that the SVM successfully learned to capture the training data (typically the APs are above 0.5).
338 | - Training a Neural Network (part 2) is significantly more difficult, and often requires expert knowledge to make the network converge to a good solution (see [Michael Nielsen's](http://neuralnetworksanddeeplearning.com/) great introduction to Deep Neural Networks). The arguably most important parameter here is the learning rate (parameter `cntk_lr_per_image`).
339 | - In addition to computing the mAP, always also visualize the results on the test and the training set. This is done with the script `5_visualizeResults.py` and helps in understanding the error modes and verifying that the model behaves as expected.
340 |
341 | ### Publishing the model as a REST API
342 |
343 | Finally, the trained model can be used to create a web service or REST API on Azure. For this, we recommend using Flask, which makes it easy to run Python code in the cloud. See the tutorial [Creating web apps with Flask in Azure](https://azure.microsoft.com/en-us/documentation/articles/web-sites-python-create-deploy-flask-app/) for an introduction to Flask, and the GitHub repo [Azure-WebApp-w-CNTK](https://github.com/ilkarman/Azure-WebApp-w-CNTK) for an example of how to deploy and run CNTK inside a web service on Azure.
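A minimal sketch of such a web service, loading the model only once at start-up and keeping it in memory (the `loadModel` and `scoreImage` helpers are hypothetical placeholders for the functionality of `6_scoreImage.py`):

```python
from flask import Flask, request, jsonify

app = Flask(__name__)

def loadModel(modelDir):
    # Hypothetical placeholder: would load the CNTK model and the
    # trained SVM weights (cf. 6_scoreImage.py).
    return object()

def scoreImage(model, imgBytes):
    # Hypothetical placeholder for the in-memory scoring pipeline:
    # compute ROIs, run the DNN, classify, apply Non-Maxima Suppression.
    return []

model = loadModel("proc/grocery/trainedSVMs/")  # load once, keep in memory

@app.route("/detect", methods=["POST"])
def detect():
    imgBytes = request.files["image"].read()
    return jsonify(detections=scoreImage(model, imgBytes))

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
```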
344 |
345 |
346 |
347 |
348 | PART 4
349 | --------------
350 |
354 |
355 | The last part of this tutorial shows how to reproduce published results on the Pascal VOC dataset.
356 |
357 | First, the Pascal VOC data as well as the pre-computed Selective Search boxes need to be downloaded from these links: [VOCtest_06-Nov-2007.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar),
358 | [VOCtrainval_06-Nov-2007.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar),
359 | [selective_search_data.tgz](http://www.cs.berkeley.edu/~rbg/fast-rcnn-data/selective_search_data.tgz).
360 |
361 |
362 |
363 |
364 |
365 |
366 | These three tar-compressed files should be extracted and copied into the *resources/pascalVocData/* directory. Your resources folder should then look like this:
367 | ```bash
368 | resources/pascalVocData/selective_search_data
369 | resources/pascalVocData/VOCdevkit2007/VOC2007
370 | resources/pascalVocData/VOCdevkit2007/VOC2007/Annotations
371 | resources/pascalVocData/VOCdevkit2007/VOC2007/ImageSets
372 | resources/pascalVocData/VOCdevkit2007/VOC2007/JPEGImages
373 | ```
374 |
375 | Second, the `datasetName` variable in `PARAMETERS.py` needs to point to the Pascal VOC dataset instead of our grocery dataset:
376 | ```python
377 | datasetName = "pascalVoc"
378 | ```
379 |
380 | Now the steps from part 1 can be executed in order with the exception of:
381 | - Step 1: ROI generation is not necessary since we use the downloaded Selective Search boxes instead.
382 | - Step 4: SVM training is not necessary since the classification is done by adding a new softmax layer to the network (similar to part 2).
383 |
384 | Note that Pascal VOC is a very big dataset and hence some of the steps (especially the CNTK training in step 3) will take hours to complete.
385 |
386 | The table below shows the mean Average Precision (mAP) of our final model, and compares this figure to the corresponding experiment in the [Fast R-CNN](https://arxiv.org/pdf/1504.08083v2.pdf) paper (Table 6, group "S"). Note that this tutorial uses an AlexNet architecture, and we do not perform bounding box regression. To be consistent with the paper, our model is trained using the VOC 2007 "trainval" set, and the mean Average Precision is computed on the VOC 2007 "test" set.
387 |
388 | |Dataset| mAP
389 | |---|---
390 | |Published results|0.52
391 | |Our results|0.48
392 |
393 | More information on training a PascalVOC classifier (including a download link to a trained model) can be found at [CNTK's Fast-RCNN page](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Detection/FastRCNN).
394 |
395 |
396 | TECHNOLOGY
397 | --------------
398 |
399 | ### Fast R-CNN
400 | R-CNNs for Object Detection were first presented in 2014 by [Ross Girshick et al.](http://arxiv.org/abs/1311.2524), and shown to outperform previous state-of-the-art approaches on one of the major object recognition challenges in the field: [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/). Since then, two follow-up papers were published which contain significant speed improvements: [Fast R-CNN](https://arxiv.org/pdf/1504.08083v2.pdf) and [Faster R-CNN](https://arxiv.org/abs/1506.01497).
401 |
402 | The basic idea of R-CNN is to take a deep Neural Network which was originally trained for image classification using millions of annotated images and to modify it for the purpose of object detection. The basic idea from the first R-CNN paper is illustrated in the figure below (taken from the paper): (1) given an input image, (2) a large number of region proposals are generated in a first step. (3) These region proposals, or Regions-of-Interest (ROIs), are then each independently sent through the network, which outputs a vector of e.g. 4096 floating point values for each ROI. Finally, (4) a classifier is learned which takes the 4096-float ROI representation as input and outputs a label and confidence for each ROI.
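In code, the original (slow) R-CNN pipeline boils down to a loop of the following shape (a conceptual sketch; `proposeRegions`, `cropImage`, `cnnFeaturizer` and `classify` are hypothetical stand-ins for the components described above):

```python
# Conceptual sketch of the original R-CNN pipeline; all functions are
# hypothetical stand-ins for the components described in the text.
def rcnnDetect(image, proposeRegions, cropImage, cnnFeaturizer, classify):
    detections = []
    for roi in proposeRegions(image):           # step 2: e.g. Selective Search
        crop = cropImage(image, roi)            # step 3: one network pass per ROI
        features = cnnFeaturizer(crop)          # e.g. a 4096-float embedding
        label, confidence = classify(features)  # step 4: SVM or soft-max layer
        if label != "background":
            detections.append((roi, label, confidence))
    return detections
```

This per-ROI network evaluation is exactly the cost that Fast R-CNN removes, as described next.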
403 |
404 |
405 |
406 |
407 | While this approach works well in terms of accuracy, it is very costly to compute since the Neural Network has to be evaluated for each ROI. Fast R-CNN addresses this drawback by evaluating most of the network (to be specific: the convolution layers) only a single time per image. According to the authors, this leads to a 213x speed-up during testing and a 9x speed-up during training, without loss of accuracy.
408 |
409 | The original Caffe implementations used in the R-CNN papers can be found on GitHub:
410 | [RCNN](https://github.com/rbgirshick/rcnn), [Fast R-CNN](https://github.com/rbgirshick/fast-rcnn), and [Faster R-CNN](https://github.com/rbgirshick/py-faster-rcnn). This tutorial uses some of the code from these repositories, notably (but not exclusively) for SVM training and model evaluation.
411 |
412 | ### SVM vs NN training
413 | In the last section we described how a linear SVM model is trained on the 4096-float ROI embedding. Alternatively, this classification can be performed directly in the neural network using a soft-max layer which takes the 4096 floats of the second-to-last fully-connected layer as input. The pros and cons of the two approaches are outlined below.
414 |
415 | The advantage of adding a new soft-max layer is that the full network can be retrained using backpropagation, including all convolution layers, which can lead to (slightly to moderately) better prediction accuracies. Another (implementation-dependent) advantage is that only (number of classes + 1) floats per ROI need to be written to disk, compared to the 4096-float ROI embedding used to train an SVM.
416 | On the other hand, training a Neural Network requires a good GPU, is even then 1-2 orders of magnitude slower than training an SVM, and requires extensive parameter tweaking and expert knowledge.
417 |
418 | ### Selective Search
419 | [Selective Search](http://koen.me/research/pub/uijlings-ijcv2013-draft.pdf) is a method for finding a large set of possible object locations in an image, independent of the class of the actual object. It works by clustering image pixels into segments, and then performing hierarchical clustering to combine segments from the same object into object proposals. The first image in part 1 shows an example output of Selective Search, where each possible object location is visualized by a green rectangle. These rectangles are then used as Regions-of-Interests (ROIs) in the R-CNN pipeline.
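A minimal sketch of generating such proposals using the open-source `selectivesearch` Python package (shown for illustration only; whether this exact package matches the Selective Search code used by this tutorial is an assumption):

```python
import skimage.data
import selectivesearch   # pip package 'selectivesearch'

# Run Selective Search on a sample image and collect the candidate boxes.
img = skimage.data.astronaut()
_, regions = selectivesearch.selective_search(img, scale=500, sigma=0.9, min_size=10)

# Each region is a dict whose 'rect' entry holds an (x, y, w, h) box.
rois = set(region['rect'] for region in regions)
print("Number of candidate object locations:", len(rois))
```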
420 |
421 | The goal of ROI generation is to find a small set of ROIs which nevertheless tightly covers as many objects in the image as possible. This computation has to be sufficiently fast, while at the same time finding object locations at different scales and aspect ratios. Selective Search was shown to perform well at this task, with a good accuracy-to-speed trade-off.
422 |
423 |
424 | ### Non-maxima suppression
425 | Object detection methods often output multiple detections which fully or partly cover the same object in an image. These ROIs need to be merged to be able to count objects and obtain their exact locations in the image. This is traditionally done using a technique called Non-Maxima Suppression (NMS). The version of NMS we use (and which was also used in the R-CNN publications) does not merge ROIs but instead tries to identify which ROIs best cover the real locations of an object and discards all other ROIs. This is implemented by iteratively selecting the ROI with highest confidence and removing all other ROIs which significantly overlap this ROI and are classified to be of the same class.
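A minimal sketch of this greedy scheme (an illustration of the idea; the actual implementation used in this tutorial is in *fastRCNN/nms.py*):

```python
# Minimal sketch of greedy Non-Maxima Suppression for one class.
# boxes: list of (x1, y1, x2, y2) tuples; scores: matching confidences.
def iou(boxA, boxB):
    ix = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    iy = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    inter = ix * iy
    areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    return inter / float(areaA + areaB - inter)

def nms(boxes, scores, overlapThreshold=0.3):
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    keep = []
    while order:
        best = order.pop(0)   # the highest-confidence remaining ROI is kept
        keep.append(best)
        # discard all remaining ROIs which significantly overlap it
        order = [i for i in order if iou(boxes[best], boxes[i]) < overlapThreshold]
    return keep
```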
426 |
427 | Detection results before (left) and after (right) Non-maxima Suppression:
428 |
429 |
430 |
431 |
432 |
433 | ### Mean Average Precision
434 | Once trained, the quality of the model can be measured using different criteria, such as precision, recall, accuracy, area-under-curve, etc. A common metric used for the Pascal VOC object recognition challenge is the Average Precision (AP) for each class. Average Precision takes the confidence of the detections into account and hence assigns a smaller penalty to false detections with low confidence. For a description of Average Precision see [Everingham et al.](http://homepages.inf.ed.ac.uk/ckiw/postscript/ijcv_voc09.pdf). The mean Average Precision (mAP) is computed by taking the average over the APs of all classes.
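A minimal sketch of an AP computation over a ranked list of detections (a simplified area-under-the-precision-recall-curve variant, not the exact VOC protocol implemented in *fastRCNN/voc_eval.py*):

```python
# Minimal sketch: Average Precision from detections ranked by descending
# confidence. isTp: booleans (True = correct detection); numGt: number of
# ground truth objects for this class.
def averagePrecision(isTp, numGt):
    ap, tp, prevRecall = 0.0, 0, 0.0
    for rank, hit in enumerate(isTp, start=1):
        if hit:
            tp += 1
            recall = tp / float(numGt)
            precision = tp / float(rank)
            ap += (recall - prevRecall) * precision  # area under the PR curve
            prevRecall = recall
    return ap

print(averagePrecision([True, True, False, True], numGt=4))  # -> 0.6875
```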
435 |
436 | FUTURE WORK
437 | ---------------
438 |
439 | One big item for future work is to use CNTK's Python APIs. Once these are fully available, the following changes can be made which should significantly improve run-time performance and simplify the code:
440 | - Reduce start-up time by loading the model only once and then keeping it persistent in memory. <-- Done in v1.
441 | - Reduce processing time using in-memory calls of the Python wrappers, rather than writing all inputs and outputs to file first and subsequently parsing the CNTK output back into memory (e.g. this is especially expensive for the temporary file *train.z* in step 3, which can be many gigabytes in size). <-- Done in v1.
442 | - Reduce code complexity by evaluating the network for each ROI on-the-fly in the `im_detect()` function rather than pre-computing all outputs in steps 4 and 5.
443 |
444 | Other items for future work include:
445 | - Replace Selective Search with a faster and more accurate implementation. <-- Done in v1.
446 | - Adding bounding box regression.
447 | - Implementation of fast*er* R-CNN, i.e. performing ROI generation inside the DNN.
448 | - Using a more recent DNN topology such as ResNet instead of AlexNet.
449 |
450 |
451 | AUTHOR
452 | ---------------
453 | Patrick Buehler, Senior Data Scientist
454 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/__init__.py
--------------------------------------------------------------------------------
/data/grocery/negative/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/1.jpg
--------------------------------------------------------------------------------
/data/grocery/negative/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/2.jpg
--------------------------------------------------------------------------------
/data/grocery/negative/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/3.jpg
--------------------------------------------------------------------------------
/data/grocery/negative/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/4.jpg
--------------------------------------------------------------------------------
/data/grocery/negative/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/5.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/0.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | joghurt
2 | squash
3 | mushroom
4 | eggBox
5 | ketchup
6 | mustard
7 | water
8 | orange
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/0.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 213 337 329 473
2 | 356 347 476 468
3 | 489 408 621 468
4 | 663 393 804 467
5 | 623 549 720 619
6 | 475 559 565 623
7 | 656 709 726 892
8 | 361 810 435 880
9 | 207 741 327 881
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/0.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/11.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | water
2 | squash
3 | mushroom
4 | orange
5 | eggBox
6 | mustard
7 | joghurt
8 | ketchup
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/11.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 175 457 265 536
2 | 419 389 537 528
3 | 617 460 760 527
4 | 724 603 806 670
5 | 536 579 677 677
6 | 694 873 770 962
7 | 499 774 603 916
8 | 383 793 443 868
9 | 296 1010 419 1153
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/11.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/12.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | water
2 | mushroom
3 | squash
4 | eggBox
5 | joghurt
6 | mustard
7 | ketchup
8 | orange
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/12.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 312 316 373 484
2 | 463 423 587 493
3 | 655 361 758 485
4 | 541 541 686 641
5 | 596 718 690 841
6 | 737 848 824 932
7 | 387 749 448 824
8 | 225 814 301 882
9 | 295 957 416 1090
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/12.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/13.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | squash
2 | water
3 | squash
4 | eggBox
5 | mushroom
6 | joghurt
7 | ketchup
8 | orange
9 | mustard
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/13.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 201 348 324 491
2 | 480 425 612 493
3 | 681 349 814 499
4 | 520 545 672 640
5 | 694 573 830 647
6 | 584 721 681 850
7 | 375 754 439 826
8 | 209 821 285 888
9 | 724 856 804 940
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/13.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/14.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | water
3 | squash
4 | orange
5 | eggBox
6 | joghurt
7 | squash
8 | ketchup
9 | mushroom
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/14.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 304 487 399 537
2 | 485 471 635 527
3 | 700 375 836 527
4 | 553 607 633 684
5 | 706 577 864 676
6 | 613 760 720 886
7 | 728 817 853 962
8 | 461 861 580 954
9 | 236 849 377 949
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/14.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/17.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | orange
4 | squash
5 | water
6 | mushroom
7 | joghurt
8 | squash
9 | eggBox
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/17.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 264 469 333 520
2 | 343 461 440 523
3 | 455 453 533 527
4 | 539 376 670 528
5 | 774 445 901 529
6 | 656 593 820 676
7 | 688 800 820 952
8 | 539 781 648 908
9 | 227 838 509 933
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/17.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/18.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | squash
4 | squash
5 | mushroom
6 | orange
7 | water
8 | joghurt
9 | eggBox
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/18.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 195 492 295 548
2 | 369 412 436 547
3 | 587 403 721 552
4 | 729 399 876 552
5 | 660 623 824 706
6 | 553 625 629 700
7 | 742 793 814 981
8 | 604 820 720 965
9 | 240 813 409 953
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/18.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/19.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | mushroom
4 | eggBox
5 | orange
6 | water
7 | joghurt
8 | squash
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/19.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 244 444 307 565
2 | 336 420 409 564
3 | 563 504 686 569
4 | 689 500 814 571
5 | 532 641 608 714
6 | 697 806 770 990
7 | 563 841 681 997
8 | 376 846 497 978
9 | 216 842 335 981
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/19.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/2.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | eggBox
2 | mustard
3 | joghurt
4 | orange
5 | squash
6 | water
7 | squash
8 | mushroom
9 | ketchup
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/2.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 208 425 369 503
2 | 432 387 503 508
3 | 603 389 701 507
4 | 728 440 808 513
5 | 571 532 700 657
6 | 674 700 730 844
7 | 509 785 627 921
8 | 380 769 504 841
9 | 231 753 291 898
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/2.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/21.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mushroom
2 | eggBox
3 | water
4 | mustard
5 | ketchup
6 | squash
7 | squash
8 | joghurt
9 | orange
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/21.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 176 299 301 451
2 | 188 448 487 527
3 | 573 324 649 531
4 | 736 217 825 371
5 | 724 372 812 531
6 | 704 559 838 680
7 | 649 822 777 960
8 | 251 790 367 922
9 | 240 732 315 798
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/21.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/22.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | eggBox
2 | water
3 | ketchup
4 | squash
5 | mushroom
6 | squash
7 | mustard
8 | orange
9 | joghurt
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/22.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 201 439 511 519
2 | 595 309 677 520
3 | 750 344 841 519
4 | 736 547 868 673
5 | 527 589 668 673
6 | 680 810 808 954
7 | 577 806 655 932
8 | 445 860 517 932
9 | 272 782 391 929
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/22.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/23.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | orange
2 | ketchup
3 | water
4 | squash
5 | squash
6 | mustard
7 | eggBox
8 | mushroom
9 | joghurt
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/23.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 303 424 368 479
2 | 508 360 564 489
3 | 704 319 765 483
4 | 692 517 826 637
5 | 651 774 774 909
6 | 567 714 620 818
7 | 363 814 635 910
8 | 408 750 509 821
9 | 248 756 365 894
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/23.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/24.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | orange
2 | ketchup
3 | water
4 | mustard
5 | squash
6 | mushroom
7 | squash
8 | eggBox
9 | joghurt
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/24.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 365 412 421 471
2 | 515 351 579 485
3 | 672 324 733 487
4 | 832 353 914 496
5 | 744 528 880 645
6 | 549 560 708 643
7 | 682 777 809 918
8 | 440 773 627 910
9 | 291 738 403 877
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/24.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/26.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | eggBox
4 | squash
5 | mushroom
6 | orange
7 | joghurt
8 | squash
9 | water
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/26.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 215 371 275 501
2 | 335 372 403 499
3 | 508 439 681 513
4 | 730 365 866 512
5 | 529 587 690 672
6 | 405 571 487 647
7 | 436 746 543 878
8 | 252 784 379 914
9 | 740 765 816 952
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/26.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/3.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | eggBox
3 | ketchup
4 | squash
5 | orange
6 | joghurt
7 | mushroom
8 | water
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/3.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 196 468 325 515
2 | 355 455 584 513
3 | 607 459 696 517
4 | 557 539 688 659
5 | 724 596 809 665
6 | 666 784 785 936
7 | 388 785 515 858
8 | 268 746 333 926
9 | 588 1018 716 1162
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/3.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/4.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | ketchup
2 | mustard
3 | eggBox
4 | squash
5 | orange
6 | joghurt
7 | water
8 | mushroom
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/4.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 307 481 423 544
2 | 532 489 661 544
3 | 704 471 852 541
4 | 701 568 837 693
5 | 565 621 644 689
6 | 674 766 781 893
7 | 355 724 409 865
8 | 465 872 605 965
9 | 603 1052 730 1201
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/4.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/6.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mushroom
2 | squash
3 | mustard
4 | ketchup
5 | orange
6 | joghurt
7 | eggBox
8 | water
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/6.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 179 463 328 532
2 | 403 391 497 511
3 | 580 381 636 516
4 | 701 343 774 503
5 | 575 585 653 659
6 | 677 777 792 918
7 | 515 786 635 924
8 | 341 737 408 906
9 | 611 1006 741 1152
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/6.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/7.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | squash
2 | mushroom
3 | ketchup
4 | mustard
5 | orange
6 | water
7 | eggBox
8 | joghurt
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/7.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 213 344 344 481
2 | 472 424 589 480
3 | 627 329 694 484
4 | 741 361 817 485
5 | 714 567 792 637
6 | 643 845 841 926
7 | 492 757 589 888
8 | 307 752 423 898
9 | 311 969 435 1106
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/7.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/8.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | mushroom
4 | squash
5 | water
6 | eggBox
7 | joghurt
8 | orange
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/8.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 248 384 309 509
2 | 347 384 403 511
3 | 519 453 661 520
4 | 726 384 869 525
5 | 682 607 838 682
6 | 702 776 804 905
7 | 347 730 439 852
8 | 277 840 347 909
9 | 299 982 417 1125
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/8.jpg
--------------------------------------------------------------------------------
/data/grocery/positive/9.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | water
3 | squash
4 | mushroom
5 | orange
6 | eggBox
7 | ketchup
8 | joghurt
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/9.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 232 452 379 509
2 | 512 341 572 512
3 | 720 364 866 515
4 | 535 588 685 670
5 | 758 596 837 663
6 | 692 772 805 900
7 | 488 841 625 904
8 | 333 732 427 861
9 | 291 1001 411 1144
10 |
--------------------------------------------------------------------------------
/data/grocery/positive/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/9.jpg
--------------------------------------------------------------------------------
/data/grocery/testImages/10.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | squash
2 | orange
3 | water
4 | mushroom
5 | eggBox
6 | ketchup
7 | mustard
8 | joghurt
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/10.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 209 361 339 496
2 | 399 424 471 495
3 | 509 319 568 489
4 | 737 332 858 489
5 | 535 541 682 648
6 | 617 820 732 910
7 | 467 812 563 882
8 | 280 762 396 904
9 | 305 980 423 1117
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/10.jpg
--------------------------------------------------------------------------------
/data/grocery/testImages/15.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | orange
4 | water
5 | squash
6 | squash
7 | joghurt
8 | eggBox
9 | mushroom
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/15.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 212 379 295 508
2 | 325 352 408 512
3 | 451 443 532 513
4 | 571 303 644 515
5 | 696 353 837 509
6 | 714 797 842 938
7 | 551 786 670 938
8 | 391 788 509 932
9 | 237 824 372 918
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/15.jpg
--------------------------------------------------------------------------------
/data/grocery/testImages/20.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mushroom
2 | ketchup
3 | eggBox
4 | water
5 | orange
6 | mustard
7 | joghurt
8 | squash
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/20.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 571 475 693 537
2 | 697 336 768 468
3 | 698 472 814 540
4 | 635 615 832 686
5 | 560 615 635 682
6 | 601 713 673 837
7 | 576 810 684 962
8 | 469 812 577 946
9 | 349 809 465 936
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/20.jpg
--------------------------------------------------------------------------------
/data/grocery/testImages/25.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mustard
2 | ketchup
3 | water
4 | squash
5 | eggBox
6 | mushroom
7 | orange
8 | joghurt
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/25.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 245 369 315 499
2 | 320 348 392 499
3 | 407 329 463 499
4 | 524 385 624 501
5 | 773 444 920 523
6 | 560 587 716 670
7 | 433 565 513 644
8 | 556 793 673 945
9 | 271 772 395 905
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/25.jpg
--------------------------------------------------------------------------------
/data/grocery/testImages/5.bboxes.labels.tsv:
--------------------------------------------------------------------------------
1 | mushroom
2 | joghurt
3 | eggBox
4 | squash
5 | orange
6 | water
7 | ketchup
8 | mustard
9 | squash
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/5.bboxes.tsv:
--------------------------------------------------------------------------------
1 | 288 425 445 493
2 | 511 371 613 495
3 | 740 425 893 503
4 | 733 529 872 657
5 | 585 577 666 648
6 | 372 713 436 878
7 | 545 836 690 912
8 | 714 824 821 896
9 | 623 1012 749 1156
10 |
--------------------------------------------------------------------------------
/data/grocery/testImages/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/5.jpg
--------------------------------------------------------------------------------
/deprecated_3_runCntk_brainscript.py:
--------------------------------------------------------------------------------
1 | import os, sys, importlib, datetime
2 | import shutil, time
3 | import subprocess
4 | import PARAMETERS
5 | locals().update(importlib.import_module("PARAMETERS").__dict__)
6 |
7 |
8 |
9 | ####################################
10 | # Parameters
11 | ####################################
12 | cntkBinariesDir = "C:/local/CNTK-2-0-rc1/cntk/cntk/"
13 |
14 | # no need to change this
15 | cntkCmdStrPattern = "{0}/cntk.exe configFile={1}config.cntk currentDirectory={1}"
16 |
17 |
18 |
19 | ####################################
20 | # Main
21 | ####################################
22 | print("classifier = " + classifier)
23 | if not os.path.exists(cntkBinariesDir + "/cntk.exe"):
24 | raise Exception("Cannot find cntk.exe in directory: " + cntkBinariesDir)
25 | deleteAllFilesInDirectory(cntkFilesDir + "/tmp", None)
26 | shutil.copy(os.path.join(cntkResourcesDir, "config.cntk"), cntkFilesDir)
27 |
28 | #generate cntk command string
29 | cmdStr = cntkCmdStrPattern.format(cntkBinariesDir, cntkFilesDir, classifier)
30 | cmdStr += " ImageH={} ImageW={}".format(cntk_padHeight, cntk_padWidth)
31 | cmdStr += " NumLabels={0} NumTrainROIs={1} NumTestROIs={1}".format(len(classes), cntk_nrRois)
32 | cmdStr += " TrainROIDim={} TrainROILabelDim={}".format(4*cntk_nrRois, cntk_nrRois * cntk_featureDimensions[classifier])
33 | cmdStr += " TestROIDim={} TestROILabelDim={}".format( 4*cntk_nrRois, cntk_nrRois * cntk_featureDimensions[classifier])
34 | if classifier == 'svm':
35 | cmdStr += " [Train=[SGD=[maxEpochs=0]]]" #no need to train the network if just using it as featurizer
36 | cmdStr += " [WriteTest=[outputNodeNames=(z.fcOut.h2.y)]]"
37 | cmdStr += " [WriteTrain=[outputNodeNames=(z.fcOut.h2.y)]]"
38 |
39 | #run cntk
40 | tstart = datetime.datetime.now()
41 | os.environ['ACML_FMA'] = str(0)
42 | print(cmdStr)
43 | pid = subprocess.Popen(cmdStr, cwd = cntkFilesDir) #, creationflags=subprocess.CREATE_NEW_CONSOLE)
44 | pid.wait()
45 | print ("Time running cntk [s]: " + str((datetime.datetime.now() - tstart).total_seconds()))
46 |
47 | #delete model files written during cntk training
48 | filenames = getFilesInDirectory(cntkFilesDir + "/tmp/", postfix = None)
49 | for filename in filenames:
50 | if filename.startswith('Fast-RCNN.'):
51 | os.remove(cntkFilesDir + "/tmp/" + filename)
52 | assert pid.returncode == 0, "ERROR: cntk ended with exit code {}".format(pid.returncode)
53 |
54 | #parse cntk output
55 | print("classifier = " + classifier)
56 | image_sets = ["test", "train"]
57 | for image_set in image_sets:
58 | print("Parsing CNTK output for image set: " + image_set)
59 | cntkImgsListPath = cntkFilesDir + image_set + ".txt"
60 | outParsedDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/"
61 | if classifier == 'svm':
62 | cntkOutputPath = cntkFilesDir + image_set + ".z.fcOut.h2.y"
63 | elif classifier == 'nn':
64 | cntkOutputPath = cntkFilesDir + image_set + ".z"
65 | else:
66 | raise ValueError("Unknown classifier: " + classifier)
67 |
68 | #write cntk output for each image to separate file
69 | makeDirectory(outParsedDir)
70 | parseCntkOutput(cntkImgsListPath, cntkOutputPath, outParsedDir, cntk_nrRois, cntk_featureDimensions[classifier],
71 | saveCompressed = True, skipCheck = False) #, skip5Mod = 0)
72 |
73 | #delete the cntk output file, which can be very large and is no longer needed
74 | deleteFile(cntkOutputPath)
75 | print("DONE.")
--------------------------------------------------------------------------------
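Note: for reference, a minimal sketch of the command string the deprecated driver above assembles. All concrete values below (paths, 1000x1000 padding, 17 labels, 2000 ROIs, 4096-dim features) are illustrative assumptions, not the actual defaults from PARAMETERS.py:

    # Illustrative sketch only -- example placeholder values, not the real defaults
    cntkBinariesDir = "C:/local/CNTK-2-0-rc1/cntk/cntk/"
    cntkFilesDir = "C:/repo/proc/grocery/cntkFiles/"
    cmdStr = "{0}/cntk.exe configFile={1}config.cntk currentDirectory={1}".format(
        cntkBinariesDir, cntkFilesDir)
    cmdStr += " ImageH={} ImageW={}".format(1000, 1000)
    cmdStr += " NumLabels={0} NumTrainROIs={1} NumTestROIs={1}".format(17, 2000)
    cmdStr += " TrainROIDim={} TrainROILabelDim={}".format(4 * 2000, 2000 * 4096)
    print(cmdStr)
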
/doc/0.filter.roi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.filter.roi.jpg
--------------------------------------------------------------------------------
/doc/0.grid.roi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.grid.roi.jpg
--------------------------------------------------------------------------------
/doc/0.ss.roi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.ss.roi.jpg
--------------------------------------------------------------------------------
/doc/anno_boxes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/anno_boxes.jpg
--------------------------------------------------------------------------------
/doc/anno_labels.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/anno_labels.jpg
--------------------------------------------------------------------------------
/doc/nn_00.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_00.jpg
--------------------------------------------------------------------------------
/doc/nn_00_no_nms.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_00_no_nms.jpg
--------------------------------------------------------------------------------
/doc/nn_01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_01.jpg
--------------------------------------------------------------------------------
/doc/nn_110.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_110.jpg
--------------------------------------------------------------------------------
/doc/nn_215.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_215.jpg
--------------------------------------------------------------------------------
/doc/nn_425.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_425.jpg
--------------------------------------------------------------------------------
/doc/nn_55.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_55.jpg
--------------------------------------------------------------------------------
/doc/precision_recall.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/precision_recall.jpg
--------------------------------------------------------------------------------
/doc/rcnnPipeline.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/rcnnPipeline.JPG
--------------------------------------------------------------------------------
/doc/svm_010.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_010.jpg
--------------------------------------------------------------------------------
/doc/svm_115.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_115.jpg
--------------------------------------------------------------------------------
/doc/svm_220.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_220.jpg
--------------------------------------------------------------------------------
/doc/svm_325.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_325.jpg
--------------------------------------------------------------------------------
/doc/svm_45.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_45.jpg
--------------------------------------------------------------------------------
/fastRCNN/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | from .imdb import imdb
8 | from .pascal_voc import pascal_voc
9 |
10 |
--------------------------------------------------------------------------------
/fastRCNN/imdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import os, sys
9 | import os.path as osp
10 | import PIL
11 | import numpy as np
12 | import scipy.sparse
13 | import platform
14 | from builtins import range
15 |
16 | if sys.version_info[1] == 4 and sys.version_info[0] == 3:
17 | from .utils34_win64.cython_bbox import bbox_overlaps
18 | elif sys.version_info[1] == 5 and sys.version_info[0] == 3:
19 | from .utils35_win64.cython_bbox import bbox_overlaps
20 | else:
21 | print("ERROR: Python version {} not supported".format(sys.version_info))
22 | error
23 |
24 |
25 | class imdb(object):
26 | """Image database."""
27 |
28 | def __init__(self, name):
29 | self._name = name
30 | self._num_classes = 0
31 | self._classes = []
32 | self._image_index = []
33 | self._obj_proposer = 'selective_search'
34 | self._roidb = None
35 | self._roidb_handler = self.default_roidb
36 | # Use this dict for storing dataset specific config options
37 | self.config = {}
38 |
39 | @property
40 | def name(self):
41 | return self._name
42 |
43 | @property
44 | def num_classes(self):
45 | return len(self._classes)
46 |
47 | @property
48 | def classes(self):
49 | return self._classes
50 |
51 | @property
52 | def image_index(self):
53 | return self._image_index
54 |
55 | @property
56 | def roidb_handler(self):
57 | return self._roidb_handler
58 |
59 | @roidb_handler.setter
60 | def roidb_handler(self, val):
61 | self._roidb_handler = val
62 |
63 | @property
64 | def roidb(self):
65 | # A roidb is a list of dictionaries, each with the following keys:
66 | # boxes
67 | # gt_overlaps
68 | # gt_classes
69 | # flipped
70 | if self._roidb is not None:
71 | return self._roidb
72 | self._roidb = self.roidb_handler()
73 | return self._roidb
74 |
75 | # @property
76 | # def cache_path(self):
77 | # cache_path = osp.abspath(osp.join(datasets.ROOT_DIR, 'data', 'cache'))
78 | # print cache_path
79 | # if not os.path.exists(cache_path):
80 | # os.makedirs(cache_path)
81 | # return cache_path
82 |
83 | @property
84 | def num_images(self):
85 | return len(self.image_index)
86 |
87 | def image_path_at(self, i):
88 | raise NotImplementedError
89 |
90 | def default_roidb(self):
91 | raise NotImplementedError
92 |
93 | def evaluate_detections(self, all_boxes, output_dir=None):
94 | """
95 | all_boxes is a list of length number-of-classes.
96 | Each list element is a list of length number-of-images.
97 | Each of those list elements is either an empty list []
98 | or a numpy array of detection.
99 |
100 | all_boxes[class][image] = [] or np.array of shape #dets x 5
101 | """
102 | raise NotImplementedError
103 |
104 | def append_flipped_images(self):
105 | num_images = self.num_images
106 | widths = [PIL.Image.open(self.image_path_at(i)).size[0]
107 | for i in range(num_images)]
108 | for i in range(num_images):
109 | boxes = self.roidb[i]['boxes'].copy()
110 | oldx1 = boxes[:, 0].copy()
111 | oldx2 = boxes[:, 2].copy()
112 | boxes[:, 0] = widths[i] - oldx2 - 1
113 | boxes[:, 2] = widths[i] - oldx1 - 1
114 | assert (boxes[:, 2] >= boxes[:, 0]).all()
115 | entry = {'boxes' : boxes,
116 | 'gt_overlaps' : self.roidb[i]['gt_overlaps'],
117 | 'gt_classes' : self.roidb[i]['gt_classes'],
118 | 'flipped' : True}
119 | self.roidb.append(entry)
120 | self._image_index = self._image_index * 2
121 |
122 | def evaluate_recall(self, candidate_boxes, ar_thresh=0.5):
123 | # Record max overlap value for each gt box
124 | # Return vector of overlap values
125 | gt_overlaps = np.zeros(0)
126 | for i in range(self.num_images):
127 | gt_inds = np.where(self.roidb[i]['gt_classes'] > 0)[0]
128 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
129 |
130 | boxes = candidate_boxes[i]
131 | if boxes.shape[0] == 0:
132 | continue
133 | overlaps = bbox_overlaps(boxes.astype(np.float),
134 | gt_boxes.astype(np.float))
135 |
136 | # gt_overlaps = np.hstack((gt_overlaps, overlaps.max(axis=0)))
137 | _gt_overlaps = np.zeros((gt_boxes.shape[0]))
138 | for j in range(gt_boxes.shape[0]):
139 | argmax_overlaps = overlaps.argmax(axis=0)
140 | max_overlaps = overlaps.max(axis=0)
141 | gt_ind = max_overlaps.argmax()
142 | gt_ovr = max_overlaps.max()
143 | assert(gt_ovr >= 0)
144 | box_ind = argmax_overlaps[gt_ind]
145 | _gt_overlaps[j] = overlaps[box_ind, gt_ind]
146 | assert(_gt_overlaps[j] == gt_ovr)
147 | overlaps[box_ind, :] = -1
148 | overlaps[:, gt_ind] = -1
149 |
150 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))
151 |
152 | num_pos = gt_overlaps.size
153 | gt_overlaps = np.sort(gt_overlaps)
154 | step = 0.001
155 | thresholds = np.minimum(np.arange(0.5, 1.0 + step, step), 1.0)
156 | recalls = np.zeros_like(thresholds)
157 | for i, t in enumerate(thresholds):
158 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
159 | ar = 2 * np.trapz(recalls, thresholds)
160 |
161 | return ar, gt_overlaps, recalls, thresholds
162 |
163 | def create_roidb_from_box_list(self, box_list, gt_roidb):
164 | assert len(box_list) == self.num_images, \
165 | 'Number of boxes must match number of ground-truth images'
166 | roidb = []
167 | for i in range(self.num_images):
168 | boxes = box_list[i]
169 | num_boxes = boxes.shape[0]
170 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
171 |
172 | if gt_roidb and gt_roidb[i]:
173 | gt_boxes = gt_roidb[i]['boxes']
174 | gt_classes = gt_roidb[i]['gt_classes']
175 | if len(gt_classes) > 0: #for pascal every image has at least one annotated object; this is however not the case when negative images are included
176 | gt_overlaps = bbox_overlaps(boxes.astype(np.float),
177 | gt_boxes.astype(np.float))
178 |
179 | argmaxes = gt_overlaps.argmax(axis=1)
180 | maxes = gt_overlaps.max(axis=1)
181 | I = np.where(maxes > 0)[0]
182 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]
183 |
184 | overlaps = scipy.sparse.csr_matrix(overlaps)
185 | roidb.append({'boxes' : boxes,
186 | 'gt_classes' : np.zeros((num_boxes,),
187 | dtype=np.int32),
188 | 'gt_overlaps' : overlaps,
189 | 'flipped' : False})
190 | return roidb
191 |
192 | @staticmethod
193 | def merge_roidbs(a, b):
194 | assert len(a) == len(b)
195 | for i in range(len(a)):
196 | if a[i]: #if image has at least one annotated object
197 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
198 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
199 | b[i]['gt_classes']))
200 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
201 | b[i]['gt_overlaps']])
202 | else:
203 | a[i] = b[i]
204 | return a
205 |
206 | def competition_mode(self, on):
207 | """Turn competition mode on or off."""
208 | pass
209 |
--------------------------------------------------------------------------------
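Note: imdb.py delegates IoU computation to the compiled cython_bbox.bbox_overlaps extension. For readers without the win64 .pyd files, a pure-numpy equivalent is sketched below; bbox_overlaps_py is a hypothetical name introduced here, and the inclusive-pixel (+1) area convention mirrors the Cython version. This is an illustration, not the shipped implementation:

    import numpy as np

    def bbox_overlaps_py(boxes, gt_boxes):
        # returns the (num_boxes x num_gt) IoU matrix, using the same
        # inclusive-pixel (+1) area convention as the compiled extension
        overlaps = np.zeros((boxes.shape[0], gt_boxes.shape[0]))
        areas = (boxes[:, 2] - boxes[:, 0] + 1.) * (boxes[:, 3] - boxes[:, 1] + 1.)
        for k in range(gt_boxes.shape[0]):
            gt = gt_boxes[k]
            gt_area = (gt[2] - gt[0] + 1.) * (gt[3] - gt[1] + 1.)
            iw = np.minimum(boxes[:, 2], gt[2]) - np.maximum(boxes[:, 0], gt[0]) + 1
            ih = np.minimum(boxes[:, 3], gt[3]) - np.maximum(boxes[:, 1], gt[1]) + 1
            valid = (iw > 0) & (ih > 0)
            inter = iw * ih
            overlaps[valid, k] = inter[valid] / (areas[valid] + gt_area - inter[valid])
        return overlaps

    # one proposal vs. one ground-truth box: IoU = 25 / 175 ~= 0.143
    print(bbox_overlaps_py(np.array([[0., 0., 9., 9.]]),
                           np.array([[5., 5., 14., 14.]])))
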
/fastRCNN/nms.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 |
10 | def nms(dets, thresh):
11 | x1 = dets[:, 0]
12 | y1 = dets[:, 1]
13 | x2 = dets[:, 2]
14 | y2 = dets[:, 3]
15 | scores = dets[:, 4]
16 |
17 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
18 | order = scores.argsort()[::-1]
19 |
20 | keep = []
21 | while order.size > 0:
22 | i = order[0]
23 | keep.append(i)
24 | xx1 = np.maximum(x1[i], x1[order[1:]])
25 | yy1 = np.maximum(y1[i], y1[order[1:]])
26 | xx2 = np.minimum(x2[i], x2[order[1:]])
27 | yy2 = np.minimum(y2[i], y2[order[1:]])
28 |
29 | w = np.maximum(0.0, xx2 - xx1 + 1)
30 | h = np.maximum(0.0, yy2 - yy1 + 1)
31 | inter = w * h
32 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
33 |
34 | inds = np.where(ovr <= thresh)[0]
35 | order = order[inds + 1]
36 |
37 | return keep
38 |
--------------------------------------------------------------------------------
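Note: a quick sanity check of the greedy NMS above. The 0.3 threshold matches the default nmsThreshold in test.py; importing from fastRCNN assumes the repo's win64 Python 3.4/3.5 setup, since the package __init__ pulls in the compiled extensions:

    import numpy as np
    from fastRCNN.nms import nms  # run from the repository root

    dets = np.array([
        [10., 10., 50., 50., 0.9],     # highest score: always kept
        [12., 12., 52., 52., 0.8],     # IoU with the first box ~0.83 > 0.3: suppressed
        [100., 100., 140., 140., 0.7]  # disjoint from the first box: kept
    ])
    print(nms(dets, 0.3))  # -> [0, 2]
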
/fastRCNN/pascal_voc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | from __future__ import print_function
9 | import os, pdb
10 | import xml.dom.minidom as minidom
11 | import numpy as np
12 | import scipy.sparse
13 | import scipy.io as sio
14 | import pickle as cp
15 | import subprocess
16 | from .imdb import imdb
17 | from .voc_eval import voc_eval
18 | #from fastRCNN.imdb import imdb
19 | #from fastRCNN.voc_eval import voc_eval
20 |
21 | class pascal_voc(imdb):
22 | def __init__(self, image_set, year, classes, maxNrRois, cacheDir, devkit_path=None):
23 | imdb.__init__(self, 'voc_' + year + '_' + image_set)
24 | self._year = year
25 | self._image_set = image_set
26 | self._maxNrRois = maxNrRois
27 | self._ROOT_DIR = os.path.join(os.path.dirname(__file__), '..')
28 | self._cacheDir = cacheDir
29 | self._devkit_path = self._get_default_path() if devkit_path is None \
30 | else devkit_path
31 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
32 | self._classes = classes
33 | #('__background__', # always index 0
34 | # 'aeroplane', 'bicycle', 'bird', 'boat',
35 | # 'bottle', 'bus', 'car', 'cat', 'chair',
36 | # 'cow', 'diningtable', 'dog', 'horse',
37 | # 'motorbike', 'person', 'pottedplant',
38 | # 'sheep', 'sofa', 'train', 'tvmonitor')
39 | self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
40 | self._image_ext = '.jpg'
41 | self._image_index = self._load_image_set_index()
42 | # Default to roidb handler
43 | self._roidb_handler = self.selective_search_roidb
44 |
45 | # PASCAL specific config options
46 | self.config = {'cleanup' : True,
47 | 'use_salt' : True,
48 | 'top_k' : 2000}
49 |
50 | assert os.path.exists(self._devkit_path), \
51 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path)
52 | assert os.path.exists(self._data_path), \
53 | 'Path does not exist: {}'.format(self._data_path)
54 |
55 | @property
56 | def cache_path(self):
57 | cache_path = self._cacheDir
58 | #cache_path = osp.abspath(osp.join(datasets.ROOT_DIR, 'data', 'cache'))
59 | if not os.path.exists(cache_path):
60 | os.makedirs(cache_path)
61 | return cache_path
62 |
63 | def image_path_at(self, i):
64 | """
65 | Return the absolute path to image i in the image sequence.
66 | """
67 | return self.image_path_from_index(self._image_index[i])
68 |
69 | def image_path_from_index(self, index):
70 | """
71 | Construct an image path from the image's "index" identifier.
72 | """
73 | image_path = os.path.join(self._data_path, 'JPEGImages',
74 | index + self._image_ext)
75 | assert os.path.exists(image_path), \
76 | 'Path does not exist: {}'.format(image_path)
77 | return image_path
78 |
79 | def _load_image_set_index(self):
80 | """
81 | Load the indexes listed in this dataset's image set file.
82 | """
83 | # Example path to image set file:
84 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
85 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
86 | self._image_set + '.txt')
87 | assert os.path.exists(image_set_file), \
88 | 'Path does not exist: {}'.format(image_set_file)
89 | with open(image_set_file) as f:
90 | image_index = [x.strip() for x in f.readlines()]
91 | return image_index
92 |
93 | def _get_default_path(self):
94 | """
95 | Return the default path where PASCAL VOC is expected to be installed.
96 | """
97 | return os.path.join(self._ROOT_DIR, 'resources', 'pascalVocData', 'VOCdevkit' + self._year)
98 |
99 | def gt_roidb(self):
100 | """
101 | Return the database of ground-truth regions of interest.
102 |
103 | This function loads/saves from/to a cache file to speed up future calls.
104 | """
105 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
106 | if os.path.exists(cache_file):
107 | with open(cache_file, 'rb') as fid:
108 | roidb = cp.load(fid)
109 | print ('{} gt roidb loaded from {}'.format(self.name, cache_file))
110 | return roidb
111 |
112 | gt_roidb = [self._load_pascal_annotation(index)
113 | for index in self.image_index]
114 | with open(cache_file, 'wb') as fid:
115 | cp.dump(gt_roidb, fid, cp.HIGHEST_PROTOCOL)
116 | print ('wrote gt roidb to {}'.format(cache_file))
117 |
118 | return gt_roidb
119 |
120 | def selective_search_roidb(self):
121 | """
122 | Return the database of selective search regions of interest.
123 | Ground-truth ROIs are also included.
124 |
125 | This function loads/saves from/to a cache file to speed up future calls.
126 | """
127 | cache_file = os.path.join(self.cache_path,
128 | self.name + '_selective_search_roidb.pkl')
129 |
130 | if os.path.exists(cache_file):
131 | with open(cache_file, 'rb') as fid:
132 | roidb = cp.load(fid, encoding='latin1')
133 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file))
134 | return roidb
135 |
136 | if int(self._year) == 2007 or not self._image_set.startswith('test'):
137 | gt_roidb = self.gt_roidb()
138 | ss_roidb = self._load_selective_search_roidb(gt_roidb)
139 | roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
140 | else:
141 | roidb = self._load_selective_search_roidb(None)
142 |
143 | # Keep max of e.g. 2000 rois
144 | if type(self._maxNrRois) == int:
145 | print ("Only keep the first %d ROIs..." % self._maxNrRois)
146 | for i in range(self.num_images):
147 | gt_overlaps = roidb[i]['gt_overlaps']
148 | gt_overlaps = gt_overlaps.todense()[:self._maxNrRois]
149 | gt_overlaps = scipy.sparse.csr_matrix(gt_overlaps)
150 | roidb[i]['boxes'] = roidb[i]['boxes'][:self._maxNrRois, :]
151 | roidb[i]['gt_classes'] = roidb[i]['gt_classes'][:self._maxNrRois]
152 | roidb[i]['gt_overlaps'] = gt_overlaps
153 |
154 | with open(cache_file, 'wb') as fid:
155 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL)
156 | print ('wrote ss roidb to {}'.format(cache_file))
157 |
158 | return roidb
159 |
160 | def _load_selective_search_roidb(self, gt_roidb):
161 | filename = os.path.abspath(os.path.join(self._devkit_path, '..',
162 | 'selective_search_data',
163 | self.name + '.mat'))
164 | assert os.path.exists(filename), \
165 | 'Selective search data not found at: {}'.format(filename)
166 | raw_data = sio.loadmat(filename)['boxes'].ravel()
167 |
168 | box_list = []
169 | for i in range(raw_data.shape[0]):
170 | box_list.append(raw_data[i][:, (1, 0, 3, 2)] - 1)
171 |
172 | return self.create_roidb_from_box_list(box_list, gt_roidb)
173 |
174 | def selective_search_IJCV_roidb(self):
175 | """
176 | Return the database of selective search regions of interest.
177 | Ground-truth ROIs are also included.
178 |
179 | This function loads/saves from/to a cache file to speed up future calls.
180 | """
181 | cache_file = os.path.join(self.cache_path,
182 | '{:s}_selective_search_IJCV_top_{:d}_roidb.pkl'.
183 | format(self.name, self.config['top_k']))
184 |
185 | if os.path.exists(cache_file):
186 | with open(cache_file, 'rb') as fid:
187 | roidb = cp.load(fid)
188 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file))
189 | return roidb
190 |
191 | gt_roidb = self.gt_roidb()
192 | ss_roidb = self._load_selective_search_IJCV_roidb(gt_roidb)
193 | roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
194 | with open(cache_file, 'wb') as fid:
195 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL)
196 | print ('wrote ss roidb to {}'.format(cache_file))
197 |
198 | return roidb
199 |
200 | def _load_selective_search_IJCV_roidb(self, gt_roidb):
201 | IJCV_path = os.path.abspath(os.path.join(self.cache_path, '..',
202 | 'selective_search_IJCV_data',
203 | 'voc_' + self._year))
204 | assert os.path.exists(IJCV_path), \
205 | 'Selective search IJCV data not found at: {}'.format(IJCV_path)
206 |
207 | top_k = self.config['top_k']
208 | box_list = []
209 | for i in range(self.num_images):
210 | filename = os.path.join(IJCV_path, self.image_index[i] + '.mat')
211 | raw_data = sio.loadmat(filename)
212 | box_list.append((raw_data['boxes'][:top_k, :]-1).astype(np.uint16))
213 |
214 | return self.create_roidb_from_box_list(box_list, gt_roidb)
215 |
216 | def _load_pascal_annotation(self, index):
217 | """
218 | Load image and bounding boxes info from XML file in the PASCAL VOC
219 | format.
220 | """
221 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
222 | # print ('Loading: {}'.format(filename))
223 | def get_data_from_tag(node, tag):
224 | return node.getElementsByTagName(tag)[0].childNodes[0].data
225 |
226 | with open(filename) as f:
227 | data = minidom.parseString(f.read())
228 |
229 | objs = data.getElementsByTagName('object')
230 | num_objs = len(objs)
231 |
232 | boxes = np.zeros((num_objs, 4), dtype=np.uint16)
233 | gt_classes = np.zeros((num_objs), dtype=np.int32)
234 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
235 |
236 | # Load object bounding boxes into a data frame.
237 | for ix, obj in enumerate(objs):
238 | # Make pixel indexes 0-based
239 | x1 = float(get_data_from_tag(obj, 'xmin')) - 1
240 | y1 = float(get_data_from_tag(obj, 'ymin')) - 1
241 | x2 = float(get_data_from_tag(obj, 'xmax')) - 1
242 | y2 = float(get_data_from_tag(obj, 'ymax')) - 1
243 | cls = self._class_to_ind[
244 | str(get_data_from_tag(obj, "name")).lower().strip()]
245 | boxes[ix, :] = [x1, y1, x2, y2]
246 | gt_classes[ix] = cls
247 | overlaps[ix, cls] = 1.0
248 |
249 | overlaps = scipy.sparse.csr_matrix(overlaps)
250 |
251 | return {'boxes' : boxes,
252 | 'gt_classes': gt_classes,
253 | 'gt_overlaps' : overlaps,
254 | 'flipped' : False}
255 |
256 | def _write_voc_results_file(self, all_boxes, output_dir):
257 | comp_id = 'comp4'
258 | if self.config['use_salt']:
259 | comp_id += '-{}'.format(os.getpid())
260 |
261 | for cls_ind, cls in enumerate(self.classes):
262 | if cls == '__background__':
263 | continue
264 | print ('Writing {} VOC results file'.format(cls))
265 | filename = self._get_voc_results_file_template(output_dir).format(cls)
266 | with open(filename, 'wt') as f:
267 | for im_ind, index in enumerate(self.image_index):
268 | dets = all_boxes[cls_ind][im_ind]
269 | if len(dets) == 0:
270 | continue
271 | # the VOCdevkit expects 1-based indices
272 | for k in range(dets.shape[0]):
273 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
274 | format(index, dets[k, -1],
275 | dets[k, 0] + 1, dets[k, 1] + 1,
276 | dets[k, 2] + 1, dets[k, 3] + 1))
277 | return comp_id
278 |
279 | def evaluate_detections(self, all_boxes, output_dir, boUsePythonImpl = True, use_07_metric = False):
280 | comp_id = self._write_voc_results_file(all_boxes, output_dir)
281 | if not boUsePythonImpl:
282 | self._do_matlab_eval(comp_id, output_dir)
283 | else:
284 | self._do_python_eval(output_dir, use_07_metric)
285 | return []
286 |
287 | def _do_matlab_eval(self, comp_id, output_dir='output'):
288 | rm_results = self.config['cleanup']
289 | #NOTE: legacy Fast R-CNN path; 'datasets.MATLAB' below is not defined in this port
290 | path = os.path.join(os.path.dirname(__file__),
291 | 'VOCdevkit-matlab-wrapper')
292 | cmd = 'cd {} && '.format(path)
293 | cmd += '{:s} -nodisplay -nodesktop '.format(datasets.MATLAB)
294 | cmd += '-r "dbstop if error; '
295 | cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\',{:d}); quit;"' \
296 | .format(self._devkit_path, comp_id,
297 | self._image_set, output_dir, int(rm_results))
298 | print('Running:\n{}'.format(cmd))
299 | status = subprocess.call(cmd, shell=True)
300 |
301 | def competition_mode(self, on):
302 | if on:
303 | self.config['use_salt'] = False
304 | self.config['cleanup'] = False
305 | else:
306 | self.config['use_salt'] = True
307 | self.config['cleanup'] = True
308 |
309 | #########################################################################
310 | # Python evaluation functions (copied from faster-RCNN)
311 | ##########################################################################
312 | def _get_voc_results_file_template(self, evalDir):
313 | if not os.path.exists(evalDir):
314 | os.makedirs(evalDir)
315 | filename = self._image_set + '_{:s}.txt'
316 | return os.path.join(evalDir, filename)
317 |
318 | def _do_python_eval(self, output_dir='output', use_07_metric=None):
319 | annopath = os.path.join(self._devkit_path, 'VOC' + self._year, 'Annotations', '{}.xml')
320 | imagesetfile = os.path.join(
321 | self._devkit_path,
322 | 'VOC' + self._year,
323 | 'ImageSets',
324 | 'Main',
325 | self._image_set + '.txt')
326 | aps = []
327 | # The PASCAL VOC metric changed in 2010
328 | if use_07_metric is None:
329 | use_07_metric = int(self._year) < 2010
330 |
331 | print ('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
332 | if not os.path.isdir(output_dir):
333 | os.mkdir(output_dir)
334 | for i, cls in enumerate(self._classes):
335 | if cls == '__background__':
336 | continue
337 | filename = self._get_voc_results_file_template(output_dir).format(cls)
338 |
339 | rec, prec, ap = voc_eval(
340 | filename, annopath, imagesetfile, cls, cachedir = output_dir, ovthresh=0.5,
341 | use_07_metric=use_07_metric)
342 | aps += [ap]
343 | print('AP for {} = {:.4f}'.format(cls, ap))
344 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
345 | cp.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
346 | print('Mean AP = {:.4f}'.format(np.mean(aps)))
347 | # print('~~~~~~~~')
348 | # print('Results:')
349 | # for ap in aps:
350 | # print('{:.3f}'.format(ap))
351 | # print('{:.3f}'.format(np.mean(aps)))
352 | # print('~~~~~~~~')
353 | # print('')
354 | print('--------------------------------------------------------------')
355 | print('Results computed with the **unofficial** Python eval code.')
356 | print('Results should be very close to the official MATLAB eval code.')
357 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
358 | print('-- Thanks, The Management')
359 | print('--------------------------------------------------------------')
360 |
361 | if __name__ == '__main__':
362 | d = datasets.pascal_voc('trainval', '2007') #legacy smoke test: 'datasets' is not defined in this port
363 | res = d.roidb
364 | from IPython import embed; embed()
--------------------------------------------------------------------------------
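Note: _load_pascal_annotation above expects standard PASCAL VOC XML, one &lt;object&gt; element per box with 1-based pixel coordinates. A minimal self-contained illustration of the format and of the get_data_from_tag helper (class name and coordinates are made up):

    import xml.dom.minidom as minidom

    xmlStr = """<annotation>
      <object>
        <name>dog</name>
        <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
      </object>
    </annotation>"""

    def get_data_from_tag(node, tag):
        return node.getElementsByTagName(tag)[0].childNodes[0].data

    data = minidom.parseString(xmlStr)
    obj = data.getElementsByTagName('object')[0]
    # the loader subtracts 1 to make pixel indexes 0-based
    print(get_data_from_tag(obj, 'name'), float(get_data_from_tag(obj, 'xmin')) - 1)
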
/fastRCNN/test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | """Test a Fast R-CNN network on an imdb (image database)."""
9 |
10 | #from config import cfg #, get_output_dir
11 | #from blob import im_list_to_blob
12 | from __future__ import print_function
13 | import os, sys, cv2, numpy as np, pickle as cp, heapq
14 | from .nms import nms as nmsPython
15 | from .timer import Timer
16 | from helpers import im_detect, apply_nms
17 | from builtins import range
18 | import pdb
19 |
20 | # if sys.version_info[0] < 3:
21 | # from utils2_win64.cython_nms import nms
22 | # else:
23 | # from .utils3_win64.cython_nms import nms
24 |
25 | if sys.version_info[1] == 4 and sys.version_info[0] == 3:
26 | from .utils34_win64.cython_nms import nms
27 | elif sys.version_info[1] == 5 and sys.version_info[0] == 3:
28 | from .utils35_win64.cython_nms import nms
29 | else:
30 | print("ERROR: Python version {} not supported".format(sys.version_info))
31 | error
32 |
33 | # NOTE: the helpers below are legacy Fast R-CNN code; they reference the Caffe-era
34 | # 'cfg' and 'im_list_to_blob' (commented out above) and are unused in this CNTK port
35 | def _get_image_blob(im):
36 | """Converts an image into a network input.
37 |
38 | Arguments:
39 | im (ndarray): a color image in BGR order
40 |
41 | Returns:
42 | blob (ndarray): a data blob holding an image pyramid
43 | im_scale_factors (list): list of image scales (relative to im) used
44 | in the image pyramid
45 | """
46 | im_orig = im.astype(np.float32, copy=True)
47 | im_orig -= cfg.PIXEL_MEANS
48 |
49 | im_shape = im_orig.shape
50 | im_size_min = np.min(im_shape[0:2])
51 | im_size_max = np.max(im_shape[0:2])
52 |
53 | processed_ims = []
54 | im_scale_factors = []
55 |
56 | for target_size in cfg.TEST.SCALES:
57 | im_scale = float(target_size) / float(im_size_min)
58 | # Prevent the biggest axis from being more than MAX_SIZE
59 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
60 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
61 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
62 | interpolation=cv2.INTER_LINEAR)
63 | im_scale_factors.append(im_scale)
64 | processed_ims.append(im)
65 |
66 | # Create a blob to hold the input images
67 | blob = im_list_to_blob(processed_ims)
68 |
69 | return blob, np.array(im_scale_factors)
70 |
71 | def _get_rois_blob(im_rois, im_scale_factors):
72 | """Converts RoIs into network inputs.
73 |
74 | Arguments:
75 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
76 | im_scale_factors (list): scale factors as returned by _get_image_blob
77 |
78 | Returns:
79 | blob (ndarray): R x 5 matrix of RoIs in the image pyramid
80 | """
81 | rois, levels = _project_im_rois(im_rois, im_scale_factors)
82 | rois_blob = np.hstack((levels, rois))
83 | return rois_blob.astype(np.float32, copy=False)
84 |
85 | def _project_im_rois(im_rois, scales):
86 | """Project image RoIs into the image pyramid built by _get_image_blob.
87 |
88 | Arguments:
89 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
90 | scales (list): scale factors as returned by _get_image_blob
91 |
92 | Returns:
93 | rois (ndarray): R x 4 matrix of projected RoI coordinates
94 | levels (list): image pyramid levels used by each projected RoI
95 | """
96 | im_rois = im_rois.astype(np.float, copy=False)
97 |
98 | if len(scales) > 1:
99 | widths = im_rois[:, 2] - im_rois[:, 0] + 1
100 | heights = im_rois[:, 3] - im_rois[:, 1] + 1
101 |
102 | areas = widths * heights
103 | scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2)
104 | diff_areas = np.abs(scaled_areas - 224 * 224)
105 | levels = diff_areas.argmin(axis=1)[:, np.newaxis]
106 | else:
107 | levels = np.zeros((im_rois.shape[0], 1), dtype=np.int)
108 |
109 | rois = im_rois * scales[levels]
110 |
111 | return rois, levels
112 |
113 | def _get_blobs(im, rois):
114 | """Convert an image and RoIs within that image into network inputs."""
115 | blobs = {'data' : None, 'rois' : None}
116 | blobs['data'], im_scale_factors = _get_image_blob(im)
117 | blobs['rois'] = _get_rois_blob(rois, im_scale_factors)
118 | return blobs, im_scale_factors
119 |
120 | def _bbox_pred(boxes, box_deltas):
121 | """Transform the set of class-agnostic boxes into class-specific boxes
122 | by applying the predicted offsets (box_deltas)
123 | """
124 | if boxes.shape[0] == 0:
125 | return np.zeros((0, box_deltas.shape[1]))
126 |
127 | boxes = boxes.astype(np.float, copy=False)
128 | widths = boxes[:, 2] - boxes[:, 0] + cfg.EPS
129 | heights = boxes[:, 3] - boxes[:, 1] + cfg.EPS
130 | ctr_x = boxes[:, 0] + 0.5 * widths
131 | ctr_y = boxes[:, 1] + 0.5 * heights
132 |
133 | dx = box_deltas[:, 0::4]
134 | dy = box_deltas[:, 1::4]
135 | dw = box_deltas[:, 2::4]
136 | dh = box_deltas[:, 3::4]
137 |
138 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
139 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
140 | pred_w = np.exp(dw) * widths[:, np.newaxis]
141 | pred_h = np.exp(dh) * heights[:, np.newaxis]
142 |
143 | pred_boxes = np.zeros(box_deltas.shape)
144 | # x1
145 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
146 | # y1
147 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
148 | # x2
149 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
150 | # y2
151 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
152 |
153 | return pred_boxes
154 |
155 | def _clip_boxes(boxes, im_shape):
156 | """Clip boxes to image boundaries."""
157 | # x1 >= 0
158 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)
159 | # y1 >= 0
160 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
161 | # x2 < im_shape[1]
162 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
163 | # y2 < im_shape[0]
164 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
165 | return boxes
166 |
167 | # def im_detect(net, im, boxes):
168 | # """Detect object classes in an image given object proposals.
169 | #
170 | # Arguments:
171 | # net (caffe.Net): Fast R-CNN network to use
172 | # im (ndarray): color image to test (in BGR order)
173 | # boxes (ndarray): R x 4 array of object proposals
174 | #
175 | # Returns:
176 | # scores (ndarray): R x K array of object class scores (K includes
177 | # background as object category 0)
178 | # boxes (ndarray): R x (4*K) array of predicted bounding boxes
179 | # """
180 | # blobs, unused_im_scale_factors = _get_blobs(im, boxes)
181 | #
182 | # # When mapping from image ROIs to feature map ROIs, there's some aliasing
183 | # # (some distinct image ROIs get mapped to the same feature ROI).
184 | # # Here, we identify duplicate feature ROIs, so we only compute features
185 | # # on the unique subset.
186 | # if cfg.DEDUP_BOXES > 0:
187 | # v = np.array([1, 1e3, 1e6, 1e9, 1e12])
188 | # hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v)
189 | # _, index, inv_index = np.unique(hashes, return_index=True,
190 | # return_inverse=True)
191 | # blobs['rois'] = blobs['rois'][index, :]
192 | # boxes = boxes[index, :]
193 | #
194 | # # reshape network inputs
195 | # net.blobs['data'].reshape(*(blobs['data'].shape))
196 | # net.blobs['rois'].reshape(*(blobs['rois'].shape))
197 | # blobs_out = net.forward(data=blobs['data'].astype(np.float32, copy=False),
198 | # rois=blobs['rois'].astype(np.float32, copy=False))
199 | # if cfg.TEST.SVM:
200 | # # use the raw scores before softmax under the assumption they
201 | # # were trained as linear SVMs
202 | # scores = net.blobs['cls_score'].data
203 | # else:
204 | # # use softmax estimated probabilities
205 | # scores = blobs_out['cls_prob']
206 | #
207 | # if cfg.TEST.BBOX_REG:
208 | # # Apply bounding-box regression deltas
209 | # box_deltas = blobs_out['bbox_pred']
210 | # pred_boxes = _bbox_pred(boxes, box_deltas)
211 | # pred_boxes = _clip_boxes(pred_boxes, im.shape)
212 | # else:
213 | # # Simply repeat the boxes, once for each class
214 | # pred_boxes = np.tile(boxes, (1, scores.shape[1]))
215 | #
216 | # if cfg.DEDUP_BOXES > 0:
217 | # # Map scores and predictions back to the original set of boxes
218 | # scores = scores[inv_index, :]
219 | # pred_boxes = pred_boxes[inv_index, :]
220 | #
221 | # return scores, pred_boxes
222 |
223 | def vis_detections(im, class_name, dets, thresh=0.3):
224 | """Visual debugging of detections."""
225 | import matplotlib.pyplot as plt
226 | im = im[:, :, (2, 1, 0)]
227 | for i in range(np.minimum(10, dets.shape[0])):
228 | bbox = dets[i, :4]
229 | score = dets[i, -1]
230 | if score > thresh:
231 | plt.cla()
232 | plt.imshow(im)
233 | plt.gca().add_patch(
234 | plt.Rectangle((bbox[0], bbox[1]),
235 | bbox[2] - bbox[0],
236 | bbox[3] - bbox[1], fill=False,
237 | edgecolor='g', linewidth=3)
238 | )
239 | plt.title('{} {:.3f}'.format(class_name, score))
240 | plt.show()
241 |
242 |
243 |
244 | # TODO: MOVE THIS TO CNTK HELPERS
245 | # def test_net_noThreshold():
246 | # #boxes = roidb[i]['boxes']
247 | # scores, _, _ = im_detect(net, i, boxes, feature_scale=feature_scale, classifier=classifier)
248 | #
249 | # for j in range(1, imdb.num_classes):
250 | # inds = np.where(roidb[i]['gt_classes'] == 0)[0]
251 | # cls_scores = scores[inds, j]
252 | # cls_boxes = roidb[i]['boxes'][inds]
253 | # all_boxes[j][i] = \
254 | # np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
255 | # .astype(np.float32, copy=False)
256 | #
257 |
258 |
259 | def test_net(net, imdb, output_dir, feature_scale, classifier = 'svm', nmsThreshold = 0.3,
260 | boUsePythonImpl = False, boThresholdDetections = True, boApplyNms = True,
261 | overlapThreshold=0.5):
262 | """Test a Fast R-CNN network on an image database."""
263 | num_images = len(imdb.image_index)
264 | # heuristic: keep an average of 40 detections per class per image prior
265 | # to NMS
266 | max_per_set = 40 * num_images
267 | # heuristic: keep at most 100 detections per class per image prior to NMS
268 | max_per_image = 100
269 | # detection threshold for each class (this is adaptively set based on the
270 | # max_per_set constraint)
271 | thresh = -np.inf * np.ones(imdb.num_classes)
272 | # top_scores will hold one minheap of scores per class (used to enforce
273 | # the max_per_set constraint)
274 | top_scores = [[] for _ in range(imdb.num_classes)]
275 | # all detections are collected into:
276 | # all_boxes[cls][image] = N x 5 array of detections in
277 | # (x1, y1, x2, y2, score)
278 | all_boxes = [[[] for _ in range(num_images)]
279 | for _ in range(imdb.num_classes)]
280 |
281 | #output_dir = get_output_dir(imdb, net)
282 |
283 | # timers
284 | _t = {'im_detect' : Timer(), 'misc' : Timer()}
285 | roidb = imdb.roidb
286 |
287 | if not boThresholdDetections:
288 | for i in range(num_images):
289 | if i % 100 == 0:
290 | print (" Processing image {} of {}..".format(i, num_images))
291 | scores, _, _ = im_detect(net, i, roidb[i]['boxes'], feature_scale=feature_scale, classifier=classifier)
292 |
293 | for j in range(1, imdb.num_classes):
294 | inds = np.where(roidb[i]['gt_classes'] == 0)[0]
295 | cls_scores = scores[inds, j]
296 | cls_boxes = roidb[i]['boxes'][inds]
297 | all_boxes[j][i] = \
298 | np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
299 | .astype(np.float32, copy=False)
300 |
301 | else:
302 | for i in range(num_images):
303 | if i % 100 == 0:
304 | print (" Processing image {} of {}..".format(i, num_images))
305 | #im = cv2.imread(imdb.image_path_at(i))
306 | #_t['im_detect'].tic()
307 | scores, _, _ = im_detect(net, i, roidb[i]['boxes'], feature_scale = feature_scale, classifier = classifier)
308 | #_t['im_detect'].toc()
309 |
310 | _t['misc'].tic()
311 | for j in range(1, imdb.num_classes):
312 | #only get detections with high scores AND exclude ground truth ROIs
313 | inds = np.where((scores[:, j] > thresh[j]) &
314 | (roidb[i]['gt_classes'] == 0))[0]
315 | cls_scores = scores[inds, j]
316 |
317 | # cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
318 | boxes = roidb[i]['boxes']
319 | cls_boxes = boxes[inds]
320 |
321 | top_inds = np.argsort(-cls_scores)[:max_per_image]
322 | cls_scores = cls_scores[top_inds]
323 | cls_boxes = cls_boxes[top_inds, :]
324 | # push new scores onto the minheap
325 | for val in cls_scores:
326 | heapq.heappush(top_scores[j], val)
327 | # if we've collected more than the max number of detections,
328 | # then pop items off the minheap and update the class threshold
329 | if len(top_scores[j]) > max_per_set:
330 | while len(top_scores[j]) > max_per_set:
331 | heapq.heappop(top_scores[j])
332 | thresh[j] = top_scores[j][0]
333 |
334 | all_boxes[j][i] = \
335 | np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
336 | .astype(np.float32, copy=False)
337 |
338 | #visualize rois
339 | if False and i == 6 and j == 15:
340 | im = cv2.imread(imdb.image_path_at(i))
341 | if boUsePythonImpl:
342 | nms_boxes, nms_keepIndices = apply_nms(all_boxes, nmsThreshold, boUsePythonImpl = True)
343 | keep = nms_keepIndices[j][i]
344 | else:
345 | keep = nms(all_boxes[j][i], 0.3)
346 | #vis_detections(im, imdb.classes[j], all_boxes[j][i])
347 | vis_detections(im, imdb.classes[j], all_boxes[j][i][keep, :]) #, thres=-10.0)
348 | _t['misc'].toc()
349 |
350 | # print ('im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \
351 | # .format(i + 1, num_images, _t['im_detect'].average_time,
352 | # _t['misc'].average_time))
353 |
354 | # for j in range(1, imdb.num_classes):
355 | # thresh[j] = max(0.5, thresh[j])
356 | # print("thresh[{}] = {}".format(j, thresh[j]))
357 |
358 | #keep only the boxes with highest score for each class
359 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coord+score
360 | for j in range(1, imdb.num_classes):
361 | for i in range(num_images):
362 | inds = np.where(all_boxes[j][i][:, -1] > thresh[j])[0]
363 | if len(inds) == 0:
364 | all_boxes[j][i] = []
365 | else:
366 | all_boxes[j][i] = all_boxes[j][i][inds, :]
367 |
368 | if output_dir:
369 | det_file = os.path.join(output_dir, 'detections.pkl')
370 | with open(det_file, 'wb') as f:
371 | cp.dump(all_boxes, f, cp.HIGHEST_PROTOCOL)
372 |
373 | if boApplyNms:
374 | print ("Number of rois before non-maxima surpression: %d" % sum([len(all_boxes[i][j]) for i in range(imdb.num_classes) for j in range(imdb.num_images)]))
375 | nms_dets,_ = apply_nms(all_boxes, nmsThreshold, boUsePythonImpl)
376 | print ("Number of rois after non-maxima surpression: %d" % sum([len(nms_dets[i][j]) for i in range(imdb.num_classes) for j in range(imdb.num_images)]))
377 | else:
378 | print ("Skipping non-maxima surpression")
379 | nms_dets = all_boxes
380 |
381 | print ('Evaluating detections')
382 | return imdb.evaluate_detections(nms_dets, output_dir, overlapThreshold)
383 |
--------------------------------------------------------------------------------
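Note: the all_boxes structure that test_net fills is indexed [class][image] and holds N x 5 float32 arrays of (x1, y1, x2, y2, score); class 0 is __background__ and stays empty. A toy illustration with made-up values:

    import numpy as np

    num_classes, num_images = 2, 2  # class 0 is __background__
    all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
    # one detection for class 1 in image 0
    all_boxes[1][0] = np.array([[288, 425, 445, 493, 0.97]], dtype=np.float32)

    for j in range(1, num_classes):
        for i in range(num_images):
            print("class {}, image {}: {} detection(s)".format(j, i, len(all_boxes[j][i])))
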
/fastRCNN/timer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import time
9 |
10 | class Timer(object):
11 | """A simple timer."""
12 | def __init__(self):
13 | self.total_time = 0.
14 | self.calls = 0
15 | self.start_time = 0.
16 | self.diff = 0.
17 | self.average_time = 0.
18 |
19 | def tic(self):
20 | # using time.time instead of time.clock because time.clock
21 | # does not normalize for multithreading
22 | self.start_time = time.time()
23 |
24 | def toc(self, average=True):
25 | self.diff = time.time() - self.start_time
26 | self.total_time += self.diff
27 | self.calls += 1
28 | self.average_time = self.total_time / self.calls
29 | if average:
30 | return self.average_time
31 | else:
32 | return self.diff
33 |
--------------------------------------------------------------------------------
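Note: typical tic/toc usage of the Timer above; time.sleep stands in for real work, and importing from fastRCNN assumes the repo's win64 Python setup:

    import time
    from fastRCNN.timer import Timer  # run from the repository root

    t = Timer()
    for _ in range(3):
        t.tic()
        time.sleep(0.01)  # stand-in for im_detect etc.
        t.toc()
    print("calls={}, avg={:.4f}s, total={:.4f}s".format(
        t.calls, t.average_time, t.total_time))
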
/fastRCNN/train_svms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # --------------------------------------------------------
4 | # Fast R-CNN
5 | # Copyright (c) 2015 Microsoft
6 | # Licensed under The MIT License [see LICENSE for details]
7 | # Written by Ross Girshick
8 | # --------------------------------------------------------
9 |
10 | """
11 | Train post-hoc SVMs using the algorithm and hyper-parameters from
12 | traditional R-CNN.
13 | """
14 |
15 | from .timer import Timer
16 | from sklearn import svm
17 | import numpy as np
18 |
19 |
20 |
21 | #################################################
22 | # Slightly modified SVM training functions
23 | #################################################
24 | class SVMTrainer(object):
25 | """
26 | Trains post-hoc detection SVMs for all classes using the algorithm
27 | and hyper-parameters of traditional R-CNN.
28 | """
29 |
30 | def __init__(self, net, imdb, im_detect, svmWeightsPath, svmBiasPath, svmFeatScalePath,
31 | svm_C, svm_B, svm_nrEpochs, svm_retrainLimit, svm_evictThreshold, svm_posWeight,
32 | svm_targetNorm, svm_penality, svm_loss, svm_rngSeed):
33 | self.net = net
34 | self.imdb = imdb
35 | self.im_detect = im_detect
36 | self.svm_nrEpochs = svm_nrEpochs
37 | self.svm_targetNorm = svm_targetNorm
38 | self.svmWeightsPath = svmWeightsPath
39 | self.svmBiasPath = svmBiasPath
40 | self.svmFeatScalePath = svmFeatScalePath
41 | self.layer = 'fc7'
42 | self.hard_thresh = -1.0001
43 | self.neg_iou_thresh = 0.3
44 | dim = net.params['cls_score'][0].data.shape[1]
45 | self.feature_scale = self._get_feature_scale()
46 | print('Feature dim: {}'.format(dim))
47 | print('Feature scale: {:.3f}'.format(self.feature_scale))
48 | self.trainers = [SVMClassTrainer(cls, dim, self.feature_scale, svm_C, svm_B, svm_posWeight, svm_penality, svm_loss,
49 | svm_rngSeed, svm_retrainLimit, svm_evictThreshold) for cls in imdb.classes]
50 |
51 |
52 | def _get_feature_scale(self, num_images=100):
53 | _t = Timer()
54 | roidb = self.imdb.roidb
55 | total_norm = 0.0
56 | total_sum = 0.0
57 | count = 0.0
58 | num_images = min(num_images, self.imdb.num_images)
59 | inds = np.random.choice(range(self.imdb.num_images), size=num_images, replace=False)
60 |
61 | for i_, i in enumerate(inds):
62 | #im = cv2.imread(self.imdb.image_path_at(i))
63 | #if roidb[i]['flipped']:
64 | # im = im[:, ::-1, :]
65 | #im = self.imdb.image_path_at(i)
66 | _t.tic()
67 | scores, boxes, feat = self.im_detect(self.net, i, roidb[i]['boxes'], boReturnClassifierScore = False)
68 | _t.toc()
69 | #feat = self.net.blobs[self.layer].data
70 | total_norm += np.sqrt((feat ** 2).sum(axis=1)).sum()
71 | total_sum += 1.0 * sum(sum(feat)) / len(feat)
72 | count += feat.shape[0]
73 | print('{}/{}: avg feature norm: {:.3f}, average value: {:.3f}'.format(i_ + 1, num_images,
74 | total_norm / count, total_sum / count))
75 |
76 | return self.svm_targetNorm * 1.0 / (total_norm / count)
77 |
78 | def _get_pos_counts(self):
79 | counts = np.zeros((len(self.imdb.classes)), dtype=np.int)
80 | roidb = self.imdb.roidb
81 | for i in range(len(roidb)):
82 | for j in range(1, self.imdb.num_classes):
83 | I = np.where(roidb[i]['gt_classes'] == j)[0]
84 | counts[j] += len(I)
85 |
86 | for j in range(1, self.imdb.num_classes):
87 | print('class {:s} has {:d} positives'.
88 | format(self.imdb.classes[j], counts[j]))
89 |
90 | return counts
91 |
92 | def get_pos_examples(self):
93 | counts = self._get_pos_counts()
94 | for i in range(len(counts)):
95 | self.trainers[i].alloc_pos(counts[i])
96 |
97 | _t = Timer()
98 | roidb = self.imdb.roidb
99 | num_images = len(roidb)
100 | for i in range(num_images):
101 | #im = cv2.imread(self.imdb.image_path_at(i))
102 | #if roidb[i]['flipped']:
103 | # im = im[:, ::-1, :]
104 | #im = self.imdb.image_path_at(i)
105 | gt_inds = np.where(roidb[i]['gt_classes'] > 0)[0]
106 | gt_boxes = roidb[i]['boxes'][gt_inds]
107 | _t.tic()
108 | scores, boxes, feat = self.im_detect(self.net, i, gt_boxes, self.feature_scale, gt_inds, boReturnClassifierScore = False)
109 | _t.toc()
110 | #feat = self.net.blobs[self.layer].data
111 | for j in range(1, self.imdb.num_classes):
112 | cls_inds = np.where(roidb[i]['gt_classes'][gt_inds] == j)[0]
113 | if len(cls_inds) > 0:
114 | cls_feat = feat[cls_inds, :]
115 | self.trainers[j].append_pos(cls_feat)
116 | if i % 50 == 0:
117 | print('get_pos_examples: {:d}/{:d} {:.3f}s' \
118 | .format(i + 1, len(roidb), _t.average_time))
119 |
120 | def initialize_net(self):
121 | # Start all SVM parameters at zero
122 | self.net.params['cls_score'][0].data[...] = 0
123 | self.net.params['cls_score'][1].data[...] = 0
124 |
125 | # Initialize SVMs in a smart way. Not doing this because it's such
126 | # a good initialization that we might not learn something close to
127 | # the SVM solution.
128 | # # subtract background weights and biases for the foreground classes
129 | # w_bg = self.net.params['cls_score'][0].data[0, :]
130 | # b_bg = self.net.params['cls_score'][1].data[0]
131 | # self.net.params['cls_score'][0].data[1:, :] -= w_bg
132 | # self.net.params['cls_score'][1].data[1:] -= b_bg
133 | # # set the background weights and biases to 0 (where they shall remain)
134 | # self.net.params['cls_score'][0].data[0, :] = 0
135 | # self.net.params['cls_score'][1].data[0] = 0
136 |
137 | def update_net(self, cls_ind, w, b):
138 | self.net.params['cls_score'][0].data[cls_ind, :] = w
139 | self.net.params['cls_score'][1].data[cls_ind] = b
140 |
141 | def train_with_hard_negatives(self):
142 | _t = Timer()
143 | roidb = self.imdb.roidb
144 | num_images = len(roidb)
145 |
146 | for epoch in range(0,self.svm_nrEpochs):
147 |
148 | # num_images = 100
149 | for i in range(num_images):
150 | print("*** EPOCH = %d, IMAGE = %d *** " % (epoch, i))
151 | #im = cv2.imread(self.imdb.image_path_at(i))
152 | #if roidb[i]['flipped']:
153 | # im = im[:, ::-1, :]
154 | #im = self.imdb.image_path_at(i)
155 | _t.tic()
156 | scores, boxes, feat = self.im_detect(self.net, i, roidb[i]['boxes'], self.feature_scale)
157 | _t.toc()
158 | #feat = self.net.blobs[self.layer].data
159 | for j in range(1, self.imdb.num_classes):
160 | hard_inds = \
161 | np.where((scores[:, j] > self.hard_thresh) &
162 | (roidb[i]['gt_overlaps'][:, j].toarray().ravel() <
163 | self.neg_iou_thresh))[0]
164 | if len(hard_inds) > 0:
165 | hard_feat = feat[hard_inds, :].copy()
166 | new_w_b = \
167 | self.trainers[j].append_neg_and_retrain(feat=hard_feat)
168 | if new_w_b is not None:
169 | self.update_net(j, new_w_b[0], new_w_b[1])
170 | np.savetxt(self.svmWeightsPath[:-4] + "_epoch" + str(epoch) + ".txt", self.net.params['cls_score'][0].data)
171 | np.savetxt(self.svmBiasPath[:-4] + "_epoch" + str(epoch) + ".txt", self.net.params['cls_score'][1].data)
172 | np.savetxt(self.svmFeatScalePath[:-4] + "_epoch" + str(epoch) + ".txt", [self.feature_scale])
173 |
174 | print(('train_with_hard_negatives: '
175 | '{:d}/{:d} {:.3f}s').format(i + 1, len(roidb),
176 | _t.average_time))
177 |
178 | def train(self):
179 | # Initialize SVMs using
180 | # a. w_i = fc8_w_i - fc8_w_0
181 | # b. b_i = fc8_b_i - fc8_b_0
182 | # c. Install SVMs into net
183 | self.initialize_net()
184 |
185 | # Pass over roidb to count num positives for each class
186 | # a. Pre-allocate arrays for positive feature vectors
187 | # Pass over roidb, computing features for positives only
188 | self.get_pos_examples()
189 |
190 | # Pass over roidb
191 | # a. Compute cls_score with forward pass
192 | # b. For each class
193 | # i. Select hard negatives
194 | # ii. Add them to cache
195 | # c. For each class
196 | # i. If SVM retrain criteria met, update SVM
197 | # ii. Install new SVM into net
198 | self.train_with_hard_negatives()
199 |
200 | # One final SVM retraining for each class
201 | # Install SVMs into net
202 | for j in range(1, self.imdb.num_classes):
203 | new_w_b = self.trainers[j].append_neg_and_retrain(force=True)
204 | self.update_net(j, new_w_b[0], new_w_b[1])
205 |
206 | #save svm
207 | np.savetxt(self.svmWeightsPath, self.net.params['cls_score'][0].data)
208 | np.savetxt(self.svmBiasPath, self.net.params['cls_score'][1].data)
209 | np.savetxt(self.svmFeatScalePath, [self.feature_scale])
210 |
211 |
212 | class SVMClassTrainer(object):
213 | """Manages post-hoc SVM training for a single object class."""
214 |
215 | def __init__(self, cls, dim, feature_scale,
216 | C, B, pos_weight, svm_penality, svm_loss, svm_rngSeed, svm_retrainLimit, svm_evictThreshold):
217 | self.pos = np.zeros((0, dim), dtype=np.float32)
218 | self.neg = np.zeros((0, dim), dtype=np.float32)
219 | self.B = B
220 | self.C = C
221 | self.cls = cls
222 | self.pos_weight = pos_weight
223 | self.dim = dim
224 | self.feature_scale = feature_scale
225 | if type(pos_weight) == str: #e.g. pos_weight == 'auto'
226 | class_weight = pos_weight
227 | else:
228 | class_weight = {1: pos_weight, -1: 1}
229 |
230 | self.svm = svm.LinearSVC(C=C, class_weight=class_weight,
231 | intercept_scaling=B, verbose=1,
232 | penalty=svm_penality, loss=svm_loss,
233 | random_state=svm_rngSeed, dual=True)
234 |
235 | self.pos_cur = 0
236 | self.num_neg_added = 0
237 | self.retrain_limit = svm_retrainLimit
238 | self.evict_thresh = svm_evictThreshold
239 | self.loss_history = []
240 |
241 | def alloc_pos(self, count):
242 | self.pos_cur = 0
243 | self.pos = np.zeros((count, self.dim), dtype=np.float32)
244 |
245 | def append_pos(self, feat):
246 | num = feat.shape[0]
247 | self.pos[self.pos_cur:self.pos_cur + num, :] = feat
248 | self.pos_cur += num
249 |
250 | def train(self):
251 | print('>>> Updating {} detector <<<'.format(self.cls))
252 | num_pos = self.pos.shape[0]
253 | num_neg = self.neg.shape[0]
254 | print('Cache holds {} pos examples and {} neg examples'.
255 | format(num_pos, num_neg))
256 | X = np.vstack((self.pos, self.neg)) * self.feature_scale
257 | y = np.hstack((np.ones(num_pos),
258 | -np.ones(num_neg)))
259 | self.svm.fit(X, y)
260 | w = self.svm.coef_
261 | b = self.svm.intercept_[0]
262 |
263 | scores = self.svm.decision_function(X)
264 | pos_scores = scores[:num_pos]
265 | neg_scores = scores[num_pos:]
266 |
267 | num_neg_wrong = sum(neg_scores > 0)
268 | num_pos_wrong = sum(pos_scores < 0)
269 | meanAcc = 0.5 * (num_pos - num_pos_wrong) / num_pos + 0.5*(num_neg - num_neg_wrong) / num_neg
270 | if type(self.pos_weight) == str:
271 | pos_loss = 0
272 | else:
273 | pos_loss = (self.C * self.pos_weight *
274 | np.maximum(0, 1 - pos_scores).sum())
275 | neg_loss = self.C * np.maximum(0, 1 + neg_scores).sum()
276 | reg_loss = 0.5 * np.dot(w.ravel(), w.ravel()) + 0.5 * b ** 2
277 | tot_loss = pos_loss + neg_loss + reg_loss
278 | self.loss_history.append((meanAcc, num_pos_wrong, num_pos, num_neg_wrong, num_neg, tot_loss, pos_loss, neg_loss, reg_loss))
279 | for i, losses in enumerate(self.loss_history):
280 | print((' {:4d}: meanAcc={:.3f} -- pos wrong: {:5}/{:5}; neg wrong: {:5}/{:5}; '
281 | ' obj val: {:.3f} = {:.3f} (posUnscaled) + {:.3f} (neg) + {:.3f} (reg)').format(i, *losses))
282 |
283 | # Sanity check
284 |
285 | scores_ret = (
286 | X * 1.0 / self.feature_scale).dot(w.T * self.feature_scale) + b
287 | assert np.allclose(scores, scores_ret[:, 0], atol=1e-5), \
288 | "Scores from returned model don't match decision function"
289 |
290 | return ((w * self.feature_scale, b), pos_scores, neg_scores)
291 |
292 | def append_neg_and_retrain(self, feat=None, force=False):
293 | if feat is not None:
294 | num = feat.shape[0]
295 | self.neg = np.vstack((self.neg, feat))
296 | self.num_neg_added += num
297 | if self.num_neg_added > self.retrain_limit or force:
298 | self.num_neg_added = 0
299 | new_w_b, pos_scores, neg_scores = self.train()
300 | # scores = np.dot(self.neg, new_w_b[0].T) + new_w_b[1]
301 | # easy_inds = np.where(neg_scores < self.evict_thresh)[0]
302 | print(' Pruning easy negatives')
303 | print(' before pruning: #neg = ' + str(len(self.neg)))
304 | not_easy_inds = np.where(neg_scores >= self.evict_thresh)[0]
305 | if len(not_easy_inds) > 0:
306 | self.neg = self.neg[not_easy_inds, :]
307 | # self.neg = np.delete(self.neg, easy_inds)
308 | print(' after pruning: #neg = ' + str(len(self.neg)))
309 | print(' Cache holds {} pos examples and {} neg examples'.
310 | format(self.pos.shape[0], self.neg.shape[0]))
311 | print(' {} pos support vectors'.format((pos_scores <= 1).sum()))
312 | print(' {} neg support vectors'.format((neg_scores >= -1).sum()))
313 | return new_w_b
314 | else:
315 | return None
316 |
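The two classes above implement the post-hoc SVM stage of R-CNN: positives are featurized once, hard negatives are mined from the network's scores, and each per-class detector is retrained once enough negatives accumulate. A minimal standalone sketch of one SVMClassTrainer cycle on synthetic features; all values are illustrative (the real ones are passed in by 4_trainSvm.py via PARAMETERS.py), and the import assumes the package layout above:

    import numpy as np
    from fastRCNN.train_svms import SVMClassTrainer

    trainer = SVMClassTrainer(cls='avocado', dim=4096, feature_scale=1.0,
                              C=0.001, B=10.0, pos_weight=2.0,
                              svm_penality='l2', svm_loss='hinge',
                              svm_rngSeed=3, svm_retrainLimit=2000,
                              svm_evictThreshold=-1.1)

    pos_feats = np.random.rand(50, 4096).astype(np.float32)    # stand-in for DNN features of positive ROIs
    neg_feats = np.random.rand(500, 4096).astype(np.float32)   # stand-in for mined hard negatives

    trainer.alloc_pos(50)
    trainer.append_pos(pos_feats)
    # force=True retrains immediately and evicts easy negatives from the cache;
    # the returned weights/bias are what update_net() installs into 'cls_score'
    w, b = trainer.append_neg_and_retrain(feat=neg_feats, force=True)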
--------------------------------------------------------------------------------
/fastRCNN/utils34_win64/cython_bbox.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils34_win64/cython_bbox.pyd
--------------------------------------------------------------------------------
/fastRCNN/utils34_win64/cython_nms.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils34_win64/cython_nms.pyd
--------------------------------------------------------------------------------
/fastRCNN/utils35_win64/cython_bbox.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils35_win64/cython_bbox.pyd
--------------------------------------------------------------------------------
/fastRCNN/utils35_win64/cython_nms.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils35_win64/cython_nms.pyd
--------------------------------------------------------------------------------
/fastRCNN/voc_eval.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast/er R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Bharath Hariharan
5 | # --------------------------------------------------------
6 |
7 | from __future__ import print_function
8 | import xml.etree.ElementTree as ET
9 | import os
10 | import pickle as cp
11 | import numpy as np
12 |
13 | def parse_rec(filename):
14 | """ Parse a PASCAL VOC xml file """
15 | tree = ET.parse(filename)
16 | objects = []
17 | for obj in tree.findall('object'):
18 | obj_struct = {}
19 | obj_struct['name'] = obj.find('name').text
20 | obj_struct['pose'] = obj.find('pose').text
21 | obj_struct['truncated'] = int(obj.find('truncated').text)
22 | obj_struct['difficult'] = int(obj.find('difficult').text)
23 | bbox = obj.find('bndbox')
24 | obj_struct['bbox'] = [int(bbox.find('xmin').text),
25 | int(bbox.find('ymin').text),
26 | int(bbox.find('xmax').text),
27 | int(bbox.find('ymax').text)]
28 | objects.append(obj_struct)
29 |
30 | return objects
31 |
32 | def voc_ap(rec, prec, use_07_metric=False):
33 | """ ap = voc_ap(rec, prec, [use_07_metric])
34 | Compute VOC AP given precision and recall.
35 | If use_07_metric is true, uses the
36 | VOC 07 11 point method (default:False).
37 | """
38 | if use_07_metric:
39 | # 11 point metric
40 | ap = 0.
41 | for t in np.arange(0., 1.1, 0.1):
42 | if np.sum(rec >= t) == 0:
43 | p = 0
44 | else:
45 | p = np.max(prec[rec >= t])
46 | ap = ap + p / 11.
47 | else:
48 | # correct AP calculation
49 | # first append sentinel values at the end
50 | mrec = np.concatenate(([0.], rec, [1.]))
51 | mpre = np.concatenate(([0.], prec, [0.]))
52 |
53 | # compute the precision envelope
54 | for i in range(mpre.size - 1, 0, -1):
55 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
56 |
57 | # to calculate area under PR curve, look for points
58 | # where X axis (recall) changes value
59 | i = np.where(mrec[1:] != mrec[:-1])[0]
60 |
61 | # and sum (\Delta recall) * prec
62 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
63 | return ap
64 |
65 | def voc_eval(detpath,
66 | annopath,
67 | imagesetfile,
68 | classname,
69 | cachedir,
70 | ovthresh=0.5,
71 | use_07_metric=False):
72 | """rec, prec, ap = voc_eval(detpath,
73 | annopath,
74 | imagesetfile,
75 | classname,
76 | [ovthresh],
77 | [use_07_metric])
78 |
79 | Top level function that does the PASCAL VOC evaluation.
80 |
81 | detpath: Path to detections
82 | detpath.format(classname) should produce the detection results file.
83 | annopath: Path to annotations
84 | annopath.format(imagename) should be the xml annotations file.
85 | imagesetfile: Text file containing the list of images, one image per line.
86 |     classname: Category name
87 | cachedir: Directory for caching the annotations
88 | [ovthresh]: Overlap threshold (default = 0.5)
89 | [use_07_metric]: Whether to use VOC07's 11 point AP computation
90 | (default False)
91 | """
92 | # assumes detections are in detpath.format(classname)
93 | # assumes annotations are in annopath.format(imagename)
94 | # assumes imagesetfile is a text file with each line an image name
95 | # cachedir caches the annotations in a pickle file
96 |
97 | # first load gt
98 | if cachedir:
99 | if not os.path.isdir(cachedir):
100 | os.mkdir(cachedir)
101 | cachefile = os.path.join(cachedir, 'annots.pkl')
102 | # read list of images
103 | with open(imagesetfile, 'r') as f:
104 | lines = f.readlines()
105 | imagenames = [x.strip() for x in lines]
106 |
107 | if not cachedir or not os.path.isfile(cachefile):
108 | # load annots
109 | recs = {}
110 | for i, imagename in enumerate(imagenames):
111 | recs[imagename] = parse_rec(annopath.format(imagename))
112 | if i % 1000 == 0:
113 | print ('Reading annotation for {:d}/{:d}'.format(
114 | i + 1, len(imagenames)))
115 | # save
116 | if cachedir:
117 | print ('Saving cached annotations to {:s}'.format(cachefile))
118 | with open(cachefile, 'wb') as f:
119 | cp.dump(recs, f)
120 | else:
121 | # load
122 | with open(cachefile, 'rb') as f:
123 | recs = cp.load(f)
124 |
125 | # extract gt objects for this class
126 | class_recs = {}
127 | npos = 0
128 | for imagename in imagenames:
129 | R = [obj for obj in recs[imagename] if obj['name'] == classname]
130 | bbox = np.array([x['bbox'] for x in R])
131 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
132 | det = [False] * len(R)
133 | npos = npos + sum(~difficult)
134 | class_recs[imagename] = {'bbox': bbox,
135 | 'difficult': difficult,
136 | 'det': det}
137 |
138 | # read dets
139 | detfile = detpath.format(classname)
140 | with open(detfile, 'r') as f:
141 | lines = f.readlines()
142 |
143 | splitlines = [x.strip().split(' ') for x in lines]
144 | image_ids = [x[0] for x in splitlines]
145 | confidence = np.array([float(x[1]) for x in splitlines])
146 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
147 |
148 | # sort by confidence
149 | sorted_ind = np.argsort(-confidence)
150 | sorted_scores = np.sort(-confidence)
151 |
152 | BB = BB[sorted_ind, :]
153 | image_ids = [image_ids[x] for x in sorted_ind]
154 |
155 | # go down dets and mark TPs and FPs
156 | nd = len(image_ids)
157 | tp = np.zeros(nd)
158 | fp = np.zeros(nd)
159 | for d in range(nd):
160 | R = class_recs[image_ids[d]]
161 | bb = BB[d, :].astype(float)
162 | ovmax = -np.inf
163 | BBGT = R['bbox'].astype(float)
164 |
165 | if BBGT.size > 0:
166 | # compute overlaps
167 | # intersection
168 | ixmin = np.maximum(BBGT[:, 0], bb[0])
169 | iymin = np.maximum(BBGT[:, 1], bb[1])
170 | ixmax = np.minimum(BBGT[:, 2], bb[2])
171 | iymax = np.minimum(BBGT[:, 3], bb[3])
172 | iw = np.maximum(ixmax - ixmin + 1., 0.)
173 | ih = np.maximum(iymax - iymin + 1., 0.)
174 | inters = iw * ih
175 |
176 | # union
177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) *
179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
180 |
181 | overlaps = inters / uni
182 | ovmax = np.max(overlaps)
183 | jmax = np.argmax(overlaps)
184 |
185 | if ovmax > ovthresh:
186 | if not R['difficult'][jmax]:
187 | if not R['det'][jmax]:
188 | tp[d] = 1.
189 | R['det'][jmax] = 1
190 | else:
191 | fp[d] = 1.
192 | else:
193 | fp[d] = 1.
194 |
195 | # compute precision recall
196 | fp = np.cumsum(fp)
197 | tp = np.cumsum(tp)
198 | rec = tp / float(npos)
199 | # avoid divide by zero in case the first detection matches a difficult
200 | # ground truth
201 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
202 | ap = voc_ap(rec, prec, use_07_metric)
203 |
204 | return rec, prec, ap
205 |
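For intuition on voc_ap above, a toy check with invented values: five detections against four ground-truth boxes, so rec/prec hold the cumulative recall and precision after each detection (the import assumes the package layout above):

    import numpy as np
    from fastRCNN.voc_eval import voc_ap

    rec  = np.array([0.25, 0.25, 0.50, 0.75, 0.75])
    prec = np.array([1.00, 0.50, 0.667, 0.75, 0.60])

    print(voc_ap(rec, prec, use_07_metric=False))   # exact AP (area under the precision envelope): 0.625
    print(voc_ap(rec, prec, use_07_metric=True))    # VOC07 11-point interpolation: ~0.614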
--------------------------------------------------------------------------------
/helpers_cntk.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from __future__ import print_function
3 | from past.utils import old_div
4 |
5 | import os, pdb, sys, numpy as np
6 | from os.path import join
7 | from helpers import readTable
8 |
9 | from cntk import load_model, Trainer, UnitType, placeholder, constant, cross_entropy_with_softmax, classification_error
10 | from cntk.device import use_default_device #default #gpu, set_default_device
11 | from cntk.initializer import glorot_uniform
12 | from cntk.io import MinibatchSource, ImageDeserializer, CTFDeserializer, StreamDefs, StreamDef
13 | from cntk.io.transforms import scale
14 | from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule
15 | from cntk.logging import log_number_of_parameters, ProgressPrinter, TensorBoardProgressWriter
16 | from cntk.logging.graph import find_by_name, plot
17 | from cntk.ops import input_variable, parameter, times, combine, roipooling
18 | from cntk.ops.functions import CloneMethod
19 |
20 |
21 | ####################################
22 | # CNTK-python wrapper functions
23 | ####################################
24 | def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize):
25 | # set paths
26 | map_file = join(data_path, data_set + '.txt')
27 | roi_file = join(data_path, data_set + '.rois.txt')
28 | label_file = join(data_path, data_set + '.roilabels.txt')
29 | if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
30 | raise RuntimeError("File '%s', '%s' or '%s' does not exist. " % (map_file, roi_file, label_file))
31 |
32 | # read images
33 | nrImages = len(readTable(map_file))
34 | transforms = [scale(width=img_width, height=img_height, channels=3,
35 | scale_mode="pad", pad_value=114, interpolations='linear')]
36 | image_source = ImageDeserializer(map_file, StreamDefs(features = StreamDef(field='image', transforms=transforms)))
37 |
38 | # read rois and labels
39 | rois_dim = 4 * n_rois
40 | label_dim = n_classes * n_rois
41 | roi_source = CTFDeserializer(roi_file, StreamDefs(
42 | rois = StreamDef(field='rois', shape=rois_dim, is_sparse=False)))
43 | label_source = CTFDeserializer(label_file, StreamDefs(
44 | roiLabels = StreamDef(field='roiLabels', shape=label_dim, is_sparse=False)))
45 |
46 | # define a composite reader
47 | mb = MinibatchSource([image_source, roi_source, label_source], max_samples=sys.maxsize, randomize=randomize)
48 | return (mb, nrImages)
49 |
50 |
51 | # Defines the Fast R-CNN network model for detecting objects in images
52 | def frcn_predictor(features, rois, n_classes, base_path):
53 | # model specific variables for AlexNet
54 | model_file = base_path + "/../../../resources/cntk/AlexNet.model"
55 | roi_dim = 6
56 | feature_node_name = "features"
57 | last_conv_node_name = "conv5.y"
58 | pool_node_name = "pool3"
59 | last_hidden_node_name = "h2_d"
60 |
61 | # Load the pretrained classification net and find nodes
62 | print("Loading pre-trained model...")
63 | loaded_model = load_model(model_file)
64 | print("Loading pre-trained model... DONE.")
65 | feature_node = find_by_name(loaded_model, feature_node_name)
66 | conv_node = find_by_name(loaded_model, last_conv_node_name)
67 | pool_node = find_by_name(loaded_model, pool_node_name)
68 | last_node = find_by_name(loaded_model, last_hidden_node_name)
69 |
70 | # Clone the conv layers and the fully connected layers of the network
71 | conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: placeholder()})
72 | fc_layers = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: placeholder()})
73 |
74 | # Create the Fast R-CNN model
75 | feat_norm = features - constant(114)
76 | conv_out = conv_layers(feat_norm)
77 | roi_out = roipooling(conv_out, rois, (roi_dim, roi_dim))
78 | fc_out = fc_layers(roi_out)
79 | #fc_out.set_name("fc_out")
80 |
81 | # z = Dense(rois[0], num_classes, map_rank=1)(fc_out) # --> map_rank=1 is not yet supported
82 | W = parameter(shape=(4096, n_classes), init=glorot_uniform())
83 | b = parameter(shape=n_classes, init=0)
84 | z = times(fc_out, W) + b
85 | return z, fc_out
86 |
87 |
88 | # Initialize and train a Fast R-CNN model
89 | def init_train_fast_rcnn(image_height, image_width, num_classes, num_rois, mb_size, max_epochs, cntk_lr_per_image, l2_reg_weight,
90 | momentum_time_constant, base_path, boSkipTraining = False, debug_output=False, tensorboardLogDir = None):
91 |
92 | #make sure we use GPU for training
93 | if use_default_device().type() == 0:
94 | print("WARNING: using CPU for training.")
95 | else:
96 | print("Using GPU for training.")
97 |
98 | # Instantiate the Fast R-CNN prediction model
99 | image_input = input_variable((3, image_height, image_width))
100 | roi_input = input_variable((num_rois, 4))
101 | label_input = input_variable((num_rois, num_classes))
102 | frcn_output, frcn_penultimateLayer = frcn_predictor(image_input, roi_input, num_classes, base_path)
103 |
104 | if boSkipTraining:
105 | print("Using pre-trained DNN without refinement")
106 | return frcn_penultimateLayer
107 |
108 | # Create the minibatch source and define mapping from reader streams to network inputs
109 | minibatch_source, epoch_size = create_mb_source("train", image_height, image_width, num_classes, num_rois,
110 | base_path, randomize=True)
111 | input_map = {
112 | image_input: minibatch_source.streams.features,
113 | roi_input: minibatch_source.streams.rois,
114 | label_input: minibatch_source.streams.roiLabels
115 | }
116 |
117 | # set loss / error functions
118 | ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
119 | pe = classification_error(frcn_output, label_input, axis=1)
120 | if debug_output:
121 | plot(frcn_output, "graph_frcn.png")
122 |
123 | # set the progress printer(s)
124 | progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)]
125 |     if tensorboardLogDir is not None:
126 | tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboardLogDir, model=frcn_output)
127 | progress_writers.append(tensorboard_writer)
128 |
129 | # Set learning parameters and instantiate the trainer object
130 | lr_per_sample = [f/float(num_rois) for f in cntk_lr_per_image]
131 | lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
132 | mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
133 | learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
134 | trainer = Trainer(frcn_output, (ce, pe), learner, progress_writers)
135 |
136 | # Get minibatches of images and perform model training
137 | print("Training Fast R-CNN model for %s epochs." % max_epochs)
138 | log_number_of_parameters(frcn_output)
139 | for epoch in range(max_epochs):
140 | sample_count = 0
141 |
142 | # loop over minibatches in the epoch
143 | while sample_count < epoch_size:
144 | data = minibatch_source.next_minibatch(min(mb_size, epoch_size - sample_count), input_map=input_map)
145 | if sample_count % 100 == 1:
146 | print("Training in progress: epoch {} of {}, sample count {} of {}".format(epoch, max_epochs, sample_count, epoch_size))
147 | trainer.train_minibatch(data)
148 | sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
149 | trainer.summarize_training_progress()
150 |
151 | # Log mean of each parameter tensor, so that we can confirm that the parameters change indeed.
152 |         if tensorboardLogDir is not None:
153 | for parameter in frcn_output.parameters:
154 | tensorboard_writer.write_value(parameter.uid + "/mean", np.mean(parameter.value), epoch)
155 | tensorboard_writer.write_value(parameter.uid + "/std", np.std(parameter.value), epoch)
156 | tensorboard_writer.write_value(parameter.uid + "/absSum", np.sum(np.abs(parameter.value)), epoch)
157 |
158 | if debug_output:
159 | frcn_output.save_model("frcn_py_%s.model" % (epoch + 1))
160 | return frcn_output
161 |
162 |
163 | def run_fast_rcnn(model, data_set, image_height, image_width, num_classes, num_rois, base_path, outDir):
164 | # Create the minibatch source and define mapping from reader streams to network inputs
165 | minibatch_source, num_images = create_mb_source(data_set, image_height, image_width, num_classes, num_rois, base_path, randomize=False)
166 | input_map = {
167 | model.arguments[0]: minibatch_source['features'],
168 | model.arguments[1]: minibatch_source['rois']
169 | }
170 |
171 | # evaluate test images and write to file
172 | for imgIndex in range(0, num_images):
173 | if imgIndex % 100 == 1:
174 |             print("Evaluating image {} of {}".format(imgIndex, num_images))
175 | data = minibatch_source.next_minibatch(1, input_map=input_map)
176 | output = model.eval(data)[0]
177 | output = np.array(output, np.float32)
178 |
179 | # write to disk
180 | if imgIndex % 100 == 1:
181 | print("Writing DNN output of dimension {} to disk".format(output.shape))
182 | outPath = outDir + str(imgIndex) + ".dat"
183 | np.savez_compressed(outPath, output)
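A hedged driver sketch for the wrappers above, mirroring what 3_runCntk.py does. Every value and path here is illustrative (the real ones come from PARAMETERS.py), and running it requires CNTK plus the cntkFiles generated by 2_cntkGenerateInputs.py:

    from helpers_cntk import init_train_fast_rcnn, run_fast_rcnn

    model = init_train_fast_rcnn(image_height=1000, image_width=1000,
                                 num_classes=17, num_rois=2000,
                                 mb_size=1, max_epochs=15,
                                 cntk_lr_per_image=[0.05] * 10 + [0.005] * 5,
                                 l2_reg_weight=0.0005, momentum_time_constant=10,
                                 base_path='proc/grocery/cntkFiles/',
                                 tensorboardLogDir=None)

    # evaluate every test image and dump the per-ROI network outputs to
    # proc/grocery/cntkFiles/test_<i>.dat (saved via np.savez_compressed)
    run_fast_rcnn(model, 'test', 1000, 1000, 17, 2000,
                  'proc/grocery/cntkFiles/', 'proc/grocery/cntkFiles/test_')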
--------------------------------------------------------------------------------
/imdb_data.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | from __future__ import print_function
9 | from builtins import range
10 | import sys, os
11 | from helpers import *
12 | import scipy.sparse
13 | import scipy.io as sio
14 | import pickle as cp
15 | import numpy as np
16 | import fastRCNN
17 |
18 |
19 | class imdb_data(fastRCNN.imdb):
20 | def __init__(self, image_set, classes, maxNrRois, imgDir, roiDir, cacheDir, boAddGroundTruthRois):
21 | fastRCNN.imdb.__init__(self, image_set + ".cache") #'data_' + image_set)
22 | self._image_set = image_set
23 | self._maxNrRois = maxNrRois
24 | self._imgDir = imgDir
25 | self._roiDir = roiDir
26 | self._cacheDir = cacheDir #cache_path
27 | self._imgSubdirs ={'train': ['positive', 'negative'], 'test': ['testImages']}
28 | self._classes = classes
29 | self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
30 | self._image_ext = '.jpg'
31 | self._image_index, self._image_subdirs = self._load_image_set_index()
32 | self._roidb_handler = self.selective_search_roidb
33 | self._boAddGroundTruthRois = boAddGroundTruthRois
34 |
35 |
36 | #overwrite parent definition
37 | @property
38 | def cache_path(self):
39 | return self._cacheDir
40 |
41 | def image_path_at(self, i):
42 | """
43 | Return the absolute path to image i in the image sequence.
44 | """
45 | return self.image_path_from_index(self._image_subdirs[i], self._image_index[i])
46 |
47 | def image_path_from_index(self, subdir, fname):
48 | """
49 | Construct an image path from the image's "index" identifier.
50 | """
51 | image_path = os.path.join(self._imgDir, subdir, fname)
52 | assert os.path.exists(image_path), \
53 | 'Path does not exist: {}'.format(image_path)
54 | return image_path
55 |
56 | def _load_image_set_index(self):
57 | """
58 | Compile list of image indices and the subdirectories they are in.
59 | """
60 | image_index = []
61 | image_subdirs = []
62 | for subdir in self._imgSubdirs[self._image_set]:
63 | imgFilenames = getFilesInDirectory(os.path.join(self._imgDir,subdir), self._image_ext)
64 | image_index += imgFilenames
65 | image_subdirs += [subdir] * len(imgFilenames)
66 | return image_index, image_subdirs
67 |
68 | def gt_roidb(self):
69 | """
70 | Return the database of ground-truth regions of interest.
71 |
72 | This function loads/saves from/to a cache file to speed up future calls.
73 | """
74 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
75 | if os.path.exists(cache_file):
76 | with open(cache_file, 'rb') as fid:
77 | roidb = cp.load(fid)
78 | print ('{} gt roidb loaded from {}'.format(self.name, cache_file))
79 | return roidb
80 |
81 | gt_roidb = [self._load_annotation(i) for i in range(self.num_images)]
82 | with open(cache_file, 'wb') as fid:
83 | cp.dump(gt_roidb, fid, cp.HIGHEST_PROTOCOL)
84 | print ('wrote gt roidb to {}'.format(cache_file))
85 |
86 | return gt_roidb
87 |
88 | def selective_search_roidb(self):
89 | """
90 | Return the database of selective search regions of interest.
91 | Ground-truth ROIs are also included.
92 |
93 | This function loads/saves from/to a cache file to speed up future calls.
94 | """
95 | cache_file = os.path.join(self.cache_path,
96 | self.name + '_selective_search_roidb.pkl')
97 |
98 | if os.path.exists(cache_file):
99 | with open(cache_file, 'rb') as fid:
100 | if sys.version_info[0] < 3:
101 | roidb = cp.load(fid)
102 | else:
103 | roidb = cp.load(fid, encoding='latin1')
104 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file))
105 | return roidb
106 |
107 | gt_roidb = self.gt_roidb()
108 | ss_roidb = self._load_selective_search_roidb(gt_roidb)
109 |
110 | #add ground truth ROIs
111 | if self._boAddGroundTruthRois:
112 | roidb = self.merge_roidbs(gt_roidb, ss_roidb)
113 | else:
114 | roidb = ss_roidb
115 |
116 | #Keep max of e.g. 2000 rois
117 | if self._maxNrRois and self._maxNrRois > 0:
118 | print ("Only keeping the first %d ROIs.." % self._maxNrRois)
119 | for i in range(self.num_images):
120 | gt_overlaps = roidb[i]['gt_overlaps']
121 | gt_overlaps = gt_overlaps.todense()[:self._maxNrRois]
122 | gt_overlaps = scipy.sparse.csr_matrix(gt_overlaps)
123 | roidb[i]['gt_overlaps'] = gt_overlaps
124 | roidb[i]['boxes'] = roidb[i]['boxes'][:self._maxNrRois,:]
125 | roidb[i]['gt_classes'] = roidb[i]['gt_classes'][:self._maxNrRois]
126 |
127 | with open(cache_file, 'wb') as fid:
128 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL)
129 | print ('wrote ss roidb to {}'.format(cache_file))
130 |
131 | return roidb
132 |
133 | def _load_selective_search_roidb(self, gt_roidb):
134 | # box_list = nrImages x nrBoxes x 4
135 | box_list = []
136 | for imgFilename, subdir in zip(self._image_index, self._image_subdirs):
137 | roiPath = "{}/{}/{}.roi.txt".format(self._roiDir, subdir, imgFilename[:-4])
138 | assert os.path.exists(roiPath), "Error: rois file not found: " + roiPath
139 | rois = np.loadtxt(roiPath, np.int32)
140 | box_list.append(rois)
141 | return self.create_roidb_from_box_list(box_list, gt_roidb)
142 |
143 | def _load_annotation(self, imgIndex):
144 | """
145 | Load image and bounding boxes info from human annotations.
146 | """
147 | #negative images do not have any ground truth annotations
148 | if self._image_subdirs[imgIndex].lower() == "negative":
149 | return None
150 |
151 | imgPath = self.image_path_at(imgIndex)
152 | bboxesPaths = imgPath[:-4] + ".bboxes.tsv"
153 | labelsPaths = imgPath[:-4] + ".bboxes.labels.tsv"
154 | assert os.path.exists(bboxesPaths), "Error: ground truth bounding boxes file not found: " + bboxesPaths
155 |         assert os.path.exists(labelsPaths), "Error: ground truth labels file not found: " + labelsPaths
156 | bboxes = np.loadtxt(bboxesPaths, np.float32)
157 | labels = readFile(labelsPaths)
158 |
159 |         # if the image has only a single annotation, numpy loads it as a 1D array;
160 |         # convert it to a 2D array so the code below can iterate over rows
161 | #if len(bboxes.shape) == 1:
162 | if len(bboxes)>0 and type(bboxes[0]) == np.float32:
163 | bboxes = np.array([bboxes])
164 |
165 | #remove boxes marked as 'undecided' or 'exclude'
166 | indicesToKeep = find(labels, lambda x: x!='EXCLUDE' and x!='UNDECIDED')
167 | bboxes = [bboxes[i] for i in indicesToKeep]
168 | labels = [labels[i] for i in indicesToKeep]
169 |
170 | # Load object bounding boxes into a data frame.
171 | num_objs = len(bboxes)
172 | boxes = np.zeros((num_objs,4), dtype=np.uint16)
173 | gt_classes = np.zeros(num_objs, dtype=np.int32)
174 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
175 | for bboxIndex,(bbox,label) in enumerate(zip(bboxes,labels)):
176 | cls = self._class_to_ind[label] #.decode('utf-8')]
177 | boxes[bboxIndex, :] = bbox
178 | gt_classes[bboxIndex] = cls
179 | overlaps[bboxIndex, cls] = 1.0
180 |
181 | overlaps = scipy.sparse.csr_matrix(overlaps)
182 |
183 | return {'boxes' : boxes,
184 | 'gt_classes': gt_classes,
185 | 'gt_overlaps' : overlaps,
186 | 'flipped' : False}
187 |
188 |     # main call to compute per-class average precision
189 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coords+score
190 | # (see also test_net() in fastRCNN\test.py)
191 | def evaluate_detections(self, all_boxes, output_dir, use_07_metric=False, overlapThreshold = 0.5):
192 | aps = []
193 | for classIndex, className in enumerate(self._classes):
194 | if className != '__background__':
195 | rec, prec, ap = self._evaluate_detections(classIndex, all_boxes, use_07_metric, overlapThreshold)
196 | aps += [[className,ap]]
197 | print('AP for {:>15} = {:.4f}'.format(className, ap))
198 | print('Mean AP = {:.4f}'.format(np.nanmean(getColumn(aps,1))))
199 | return aps
200 |
201 | def _evaluate_detections(self, classIndex, all_boxes, use_07_metric = False, overlapThreshold = 0.5):
202 | """
203 | Top level function that does the PASCAL VOC evaluation.
204 |
205 | [overlapThreshold]: Overlap threshold (default = 0.5)
206 | [use_07_metric]: Whether to use VOC07's 11 point AP computation (default False)
207 | """
208 | assert (len(all_boxes) == self.num_classes)
209 | assert (len(all_boxes[0]) == self.num_images)
210 |
211 | # load ground truth annotations for this class
212 | gtInfos = []
213 | for imgIndex in range(self.num_images):
214 | imgPath = self.image_path_at(imgIndex)
215 |             imgSubdir = os.path.normpath(imgPath).split(os.path.sep)[-2]
216 |             if imgSubdir != 'negative':
217 | gtBoxes, gtLabels = readGtAnnotation(imgPath)
218 | gtBoxes = [box for box, label in zip(gtBoxes, gtLabels) if label == self.classes[classIndex]] #.decode('utf-8')
219 | else:
220 | gtBoxes = []
221 | gtInfos.append({'bbox': np.array(gtBoxes),
222 | 'difficult': [False] * len(gtBoxes),
223 | 'det': [False] * len(gtBoxes)})
224 |
225 | # parse detections for this class
226 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coords+score
227 | detBboxes = []
228 | detImgIndices = []
229 | detConfidences = []
230 | for imgIndex in range(self.num_images):
231 | dets = all_boxes[classIndex][imgIndex]
232 |             if len(dets) > 0:
233 | for k in range(dets.shape[0]):
234 | detImgIndices.append(imgIndex)
235 | detConfidences.append(dets[k, -1])
236 | # the VOCdevkit expects 1-based indices
237 | detBboxes.append([dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1])
238 | detBboxes = np.array(detBboxes)
239 | detConfidences = np.array(detConfidences)
240 |
241 | # debug: visualize GT and detections
242 | # if classIndex == 15: # and imgPath.endswith("WIN_20160803_11_42_36_Pro.jpg"):
243 | # imgIndex = 6
244 | # imgPath = self.image_path_at(imgIndex)
245 | # img = imread(imgPath)
246 | # tmp_gtBoxes = gtInfos[imgIndex]['bbox']
247 | # inds = np.where(np.array(detImgIndices) == 1)[0]
248 | # tmp_detBoxes = detBboxes[inds]
249 | # print(detConfidences[inds])
250 | # drawRectangles(img, tmp_gtBoxes, color = (255, 0, 0)) #thickness=thickness)
251 | # drawRectangles(img, tmp_detBoxes, color= (0, 255, 0)) # thickness=thickness)
252 | # imshow(img, maxDim=800)
253 |
254 | # compute precision / recall / ap
255 | rec, prec, ap = self._voc_computePrecisionRecallAp(
256 | class_recs=gtInfos,
257 | confidence=detConfidences,
258 | image_ids=detImgIndices,
259 | BB=detBboxes,
260 | ovthresh=overlapThreshold,
261 | use_07_metric=use_07_metric)
262 |
263 | return rec, prec, ap
264 |
265 |
266 | #########################################################################
267 | # Python evaluation functions (copied/refactored from faster-RCNN)
268 | ##########################################################################
269 | def _voc_computePrecisionRecallAp(self, class_recs, confidence, image_ids, BB, ovthresh=0.5, use_07_metric=False):
270 | # sort by confidence
271 | sorted_ind = np.argsort(-confidence)
272 | BB = BB[sorted_ind, :]
273 | image_ids = [image_ids[x] for x in sorted_ind]
274 |
275 | # go down dets and mark TPs and FPs
276 | nd = len(image_ids)
277 | tp = np.zeros(nd)
278 | fp = np.zeros(nd)
279 | for d in range(nd):
280 | R = class_recs[image_ids[d]]
281 | bb = BB[d, :].astype(float)
282 | ovmax = -np.inf
283 | BBGT = R['bbox'].astype(float)
284 |
285 | if BBGT.size > 0:
286 | # compute overlaps
287 | ixmin = np.maximum(BBGT[:, 0], bb[0])
288 | iymin = np.maximum(BBGT[:, 1], bb[1])
289 | ixmax = np.minimum(BBGT[:, 2], bb[2])
290 | iymax = np.minimum(BBGT[:, 3], bb[3])
291 | iw = np.maximum(ixmax - ixmin + 1., 0.)
292 | ih = np.maximum(iymax - iymin + 1., 0.)
293 | inters = iw * ih
294 |
295 | # union
296 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
297 | (BBGT[:, 2] - BBGT[:, 0] + 1.) *
298 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
299 |
300 | overlaps = inters / uni
301 | ovmax = np.max(overlaps)
302 | jmax = np.argmax(overlaps)
303 |
304 | if ovmax > ovthresh:
305 | if not R['difficult'][jmax]:
306 | if not R['det'][jmax]:
307 | tp[d] = 1.
308 | R['det'][jmax] = 1
309 | else:
310 | fp[d] = 1.
311 | else:
312 | fp[d] = 1.
313 |
314 | # compute precision recall
315 | npos = sum([len(cr['bbox']) for cr in class_recs])
316 | fp = np.cumsum(fp)
317 | tp = np.cumsum(tp)
318 | rec = tp / float(npos)
319 | # avoid divide by zero in case the first detection matches a difficult
320 | # ground truth
321 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
322 | ap = computeAveragePrecision(rec, prec, use_07_metric)
323 | return rec, prec, ap
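The TP/FP bookkeeping in _voc_computePrecisionRecallAp hinges on the 1-pixel-inclusive intersection-over-union; a standalone toy check of that formula (boxes invented, coordinates as x1, y1, x2, y2):

    import numpy as np

    def iou_inclusive(bb, BBGT):
        # same +1 convention as above: a box spans (x2 - x1 + 1) x (y2 - y1 + 1) pixels
        ixmin = np.maximum(BBGT[:, 0], bb[0])
        iymin = np.maximum(BBGT[:, 1], bb[1])
        ixmax = np.minimum(BBGT[:, 2], bb[2])
        iymax = np.minimum(BBGT[:, 3], bb[3])
        iw = np.maximum(ixmax - ixmin + 1., 0.)
        ih = np.maximum(iymax - iymin + 1., 0.)
        inters = iw * ih
        uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.)
               + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (BBGT[:, 3] - BBGT[:, 1] + 1.)
               - inters)
        return inters / uni

    # 10x10 detection vs. a 10x10 ground truth shifted by 5 pixels:
    # 25 px intersection, 100 + 100 - 25 = 175 px union -> IoU = 1/7
    print(iou_inclusive(np.array([0., 0., 9., 9.]),
                        np.array([[5., 5., 14., 14.]])))   # [0.14285714]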
--------------------------------------------------------------------------------
/resources/cntk/config.cntk:
--------------------------------------------------------------------------------
1 | # Fast-RCNN configuration for CNTK
2 | # For algorithm and details see http://arxiv.org/abs/1504.08083
3 | # Overview:
4 | # The Fast-RCNN algorithm uses a DNN that takes as inputs a set of images
5 | # and for each image a set of ROIs (Regions of interest). It first computes
6 | # a convolutional feature map for the entire image using a series
7 | # of convolutional layers (usually from a pretrained network). Then it
8 | # employs ROI pooling to crop out the part of the conv feature map
9 | # that corresponds to an ROI and resizes it to the input size expected
10 | # by the following layer (usually a set of pretrained fully connected layers).
11 | # Classification error and evaluation criterion are computed for each ROI.
12 |
13 | #makeMode = false
14 | command = Train:WriteTest:WriteTrain
15 |
16 | deviceId = "Auto"
17 | precision = "float"
18 | parallelTrain = "false"
19 | traceLevel = 1
20 |
21 | rootDir = "."
22 | dataDir = "$rootDir$"
23 | outputDir = "$rootDir$/tmp"
24 |
25 | modelPath = "$outputDir$/Fast-RCNN"
26 | #stderr = "$outputDir$/Fast-RCNN.log"
27 |
28 | ImageH = 1000
29 | ImageW = 1000
30 | ImageC = 3
31 |
32 | NumLabels = 22
33 | NumTrainROIs = 200
34 | NumTestROIs = 1000
35 |
36 | TrainROIDim = 800 # $NumTrainROIs$ * 4
37 | TrainROILabelDim = 4400 # $NumTrainROIs$ * $NumLabels$
38 | TestROIDim = 4000 # $NumTestROIs$ * 4
39 | TestROILabelDim = 22000 # $NumTestROIs$ * $NumLabels$
40 |
41 | # For training we load a pretrained AlexNet model (AlexNet.model) and clone three parts of it.
42 | # For the first part (up to pool1) we keep the weights fixed. The middle part contains the
43 | # remaining convolutional and pooling layers and the last part are the FC layers.
44 | # In the model we apply the first two cloned parts, then an ROI pooling layer and
45 | # finally the pretrained FC layers followed by a new FC layer that maps to the new
46 | # label dimensionality of $NumLabels$ (= 22) classes.
47 | # The inputs are images (1000 x 1000 x 3), ROIs ($NumTrainROIs$ ROIs x 4 coordinates (x, y, w, h))
48 | # and ground truth labels per ROI ($NumTrainROIs$ ROIs x $NumLabels$ classes).
49 | Train = {
50 | action = "train"
51 |
52 | BrainScriptNetworkBuilder = {
53 | imageShape = $ImageH$:$ImageW$:$ImageC$ # 1000:1000:3
54 |         labelShape = $NumLabels$:$NumTrainROIs$ # 22:200
55 |         ROIShape = 4:$NumTrainROIs$ # 4:200
56 |
57 | network = BS.Network.Load ("../../../resources/cntk/AlexNet.model")
58 | convLayers = BS.Network.CloneFunction(network.features, network.conv5_y, parameters = "constant")
59 | fcLayers = BS.Network.CloneFunction(network.pool3, network.h2_d)
60 |
61 |
62 | model (features, rois) = {
63 | featNorm = features - 114
64 | convOut = convLayers (featNorm)
65 | roiOut = ROIPooling (convOut, rois, (6:6))
66 | fcOut = fcLayers (roiOut)
67 | W = ParameterTensor{($NumLabels$:4096), init="glorotUniform"}
68 | b = ParameterTensor{$NumLabels$, init = 'zero'}
69 | z = W * fcOut + b
70 | }.z
71 |
72 | features = Input {imageShape}
73 | roiLabels = Input {labelShape}
74 | rois = Input {ROIShape}
75 |
76 | z = model (features, rois)
77 |
78 | ce = CrossEntropyWithSoftmax(roiLabels, z, axis = 1)
79 | errs = ClassificationError(roiLabels, z, axis = 1)
80 |
81 | featureNodes = (features:rois)
82 | labelNodes = (roiLabels)
83 | criterionNodes = (ce)
84 | evaluationNodes = (errs)
85 | outputNodes = (z)
86 | }
87 |
88 | SGD = {
89 | epochSize = 0
90 | minibatchSize = 1
91 | maxEpochs = 17
92 |
93 | #learningRatesPerSample = 0.00001
94 | #momentumAsTimeConstant = 0*5:10
95 | #dropoutRate = 0
96 |
97 | learningRatesPerMB=0.00001*10:0.000001*5:0.0000001
98 | momentumPerMB=0.9
99 | gradUpdateType=None
100 | L2RegWeight=0.0005
101 | dropoutRate=0.5 #0*5:0.5
102 |
103 | numMBsToShowResult = 50
104 | }
105 |
106 | reader = {
107 | randomize = true
108 | verbosity = 2
109 | deserializers = ({
110 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
111 | file = $dataDir$/train.rois.txt
112 | input = { rois = { dim = $TrainROIDim$ ; format = "dense" } }
113 | }:{
114 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
115 | file = $dataDir$/train.roilabels.txt
116 | input = { roiLabels = { dim = $TrainROILabelDim$ ; format = "dense" } }
117 | }:{
118 | type = "ImageDeserializer" ; module = "ImageReader"
119 | file = $dataDir$/train.txt
120 | input = {
121 | features = { transforms = (
122 |                     { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
123 | { type = "Transpose" }
124 | )}
125 | ignored = {labelDim = 1000}
126 | }
127 | })
128 | }
129 | }
130 |
131 | # Write network output for entire test data set
132 | WriteTest = {
133 | action = "write"
134 | minibatchSize = 1
135 |
136 | # outputPath = "$OutputDir$/fastrcnnNetOutput"
137 | outputPath=test
138 |
139 | reader = {
140 | randomize = false
141 | verbosity = 2
142 | deserializers = ({
143 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
144 | file = $dataDir$/test.rois.txt
145 | input = { rois = { dim = $TestROIDim$ ; format = "dense" } }
146 | }:{
147 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
148 | file = $dataDir$/test.roilabels.txt
149 | input = { roiLabels = { dim = $TestROILabelDim$ ; format = "dense" } }
150 | }:{
151 | type = "ImageDeserializer" ; module = "ImageReader"
152 | file = $dataDir$/test.txt
153 | input = {
154 | features = { transforms = (
155 |                     { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
156 | { type = "Transpose" }
157 | )}
158 | ignored = {labelDim = 1000}
159 | }
160 | })
161 | }
162 | }
163 |
164 | # Write network output for entire train data set
165 | WriteTrain = {
166 | action = "write"
167 | minibatchSize = 1
168 |
169 | # outputPath = "$OutputDir$/fastrcnnNetOutput"
170 | outputPath=train
171 |
172 | reader = {
173 | randomize = false
174 | verbosity = 2
175 | deserializers = ({
176 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
177 | file = $dataDir$/train.rois.txt
178 | input = { rois = { dim = $TestROIDim$ ; format = "dense" } }
179 | }:{
180 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
181 | file = $dataDir$/train.roilabels.txt
182 | input = { roiLabels = { dim = $TestROILabelDim$ ; format = "dense" } }
183 | }:{
184 | type = "ImageDeserializer" ; module = "ImageReader"
185 | file = $dataDir$/train.txt
186 | input = {
187 | features = { transforms = (
188 |                     { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
189 | { type = "Transpose" }
190 | )}
191 | ignored = {labelDim = 1000}
192 | }
193 | })
194 | }
195 | }
196 |
197 |
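The CNTKTextFormat files the readers above consume carry one line per image: a sequence id, a stream tag, then the flattened dense values padded to the declared dim. A hedged Python sketch of the expected line shape (the real writer is 2_cntkGenerateInputs.py; the (x, y, w, h) values and zero-padding here are illustrative):

    # flatten two invented ROIs to the fixed length the reader declares
    rois = [(0.219, 0.093, 0.430, 0.605), (0.281, 0.531, 0.484, 0.792)]
    flat = [c for roi in rois for c in roi]
    flat += [0.0] * (800 - len(flat))          # pad to TrainROIDim = 4 * NumTrainROIs = 800
    line = '0 |rois ' + ' '.join('%.3f' % v for v in flat)
    # a matching '0 |roiLabels ...' line of length TrainROILabelDim goes in train.roilabels.txt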
--------------------------------------------------------------------------------
/resources/cntk/model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/resources/cntk/model.pdf
--------------------------------------------------------------------------------
/resources/python35_64bit_requirements/opencv_python-3.2.0-cp35-cp35m-win_amd64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/resources/python35_64bit_requirements/opencv_python-3.2.0-cp35-cp35m-win_amd64.whl
--------------------------------------------------------------------------------
/resources/python35_64bit_requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | ./opencv_python-3.2.0-cp35-cp35m-win_amd64.whl
2 | scikit-learn
3 | Pillow
4 | future
5 | dlib
6 | EasyDict
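These pin the Python 3.5 / 64-bit Windows dependencies. A typical install, run from inside resources/python35_64bit_requirements so that pip resolves the relative wheel path on the first line:

    cd resources/python35_64bit_requirements
    pip install -r requirements.txt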
--------------------------------------------------------------------------------