├── .gitignore ├── 1_computeRois.py ├── 2_cntkGenerateInputs.py ├── 3_runCntk.py ├── 4_trainSvm.py ├── 5_evaluateResults.py ├── 5_visualizeResults.py ├── 6_scoreImage.py ├── A1_annotateImages.py ├── A2_annotateBboxLabels.py ├── B1_evaluateRois.py ├── B2_cntkVisualizeInputs.py ├── B3_cntkAnalyzeInputs.py ├── PARAMETERS.py ├── README.md ├── __init__.py ├── data └── grocery │ ├── negative │ ├── 1.jpg │ ├── 2.jpg │ ├── 3.jpg │ ├── 4.jpg │ └── 5.jpg │ ├── positive │ ├── 0.bboxes.labels.tsv │ ├── 0.bboxes.tsv │ ├── 0.jpg │ ├── 11.bboxes.labels.tsv │ ├── 11.bboxes.tsv │ ├── 11.jpg │ ├── 12.bboxes.labels.tsv │ ├── 12.bboxes.tsv │ ├── 12.jpg │ ├── 13.bboxes.labels.tsv │ ├── 13.bboxes.tsv │ ├── 13.jpg │ ├── 14.bboxes.labels.tsv │ ├── 14.bboxes.tsv │ ├── 14.jpg │ ├── 17.bboxes.labels.tsv │ ├── 17.bboxes.tsv │ ├── 17.jpg │ ├── 18.bboxes.labels.tsv │ ├── 18.bboxes.tsv │ ├── 18.jpg │ ├── 19.bboxes.labels.tsv │ ├── 19.bboxes.tsv │ ├── 19.jpg │ ├── 2.bboxes.labels.tsv │ ├── 2.bboxes.tsv │ ├── 2.jpg │ ├── 21.bboxes.labels.tsv │ ├── 21.bboxes.tsv │ ├── 21.jpg │ ├── 22.bboxes.labels.tsv │ ├── 22.bboxes.tsv │ ├── 22.jpg │ ├── 23.bboxes.labels.tsv │ ├── 23.bboxes.tsv │ ├── 23.jpg │ ├── 24.bboxes.labels.tsv │ ├── 24.bboxes.tsv │ ├── 24.jpg │ ├── 26.bboxes.labels.tsv │ ├── 26.bboxes.tsv │ ├── 26.jpg │ ├── 3.bboxes.labels.tsv │ ├── 3.bboxes.tsv │ ├── 3.jpg │ ├── 4.bboxes.labels.tsv │ ├── 4.bboxes.tsv │ ├── 4.jpg │ ├── 6.bboxes.labels.tsv │ ├── 6.bboxes.tsv │ ├── 6.jpg │ ├── 7.bboxes.labels.tsv │ ├── 7.bboxes.tsv │ ├── 7.jpg │ ├── 8.bboxes.labels.tsv │ ├── 8.bboxes.tsv │ ├── 8.jpg │ ├── 9.bboxes.labels.tsv │ ├── 9.bboxes.tsv │ └── 9.jpg │ └── testImages │ ├── 10.bboxes.labels.tsv │ ├── 10.bboxes.tsv │ ├── 10.jpg │ ├── 15.bboxes.labels.tsv │ ├── 15.bboxes.tsv │ ├── 15.jpg │ ├── 20.bboxes.labels.tsv │ ├── 20.bboxes.tsv │ ├── 20.jpg │ ├── 25.bboxes.labels.tsv │ ├── 25.bboxes.tsv │ ├── 25.jpg │ ├── 5.bboxes.labels.tsv │ ├── 5.bboxes.tsv │ └── 5.jpg ├── deprecated_3_runCntk_brainscript.py ├── doc ├── 0.filter.roi.jpg ├── 0.grid.roi.jpg ├── 0.ss.roi.jpg ├── anno_boxes.jpg ├── anno_labels.jpg ├── nn_00.jpg ├── nn_00_no_nms.jpg ├── nn_01.jpg ├── nn_110.jpg ├── nn_215.jpg ├── nn_425.jpg ├── nn_55.jpg ├── precision_recall.jpg ├── rcnnPipeline.JPG ├── svm_010.jpg ├── svm_115.jpg ├── svm_220.jpg ├── svm_325.jpg └── svm_45.jpg ├── fastRCNN ├── __init__.py ├── imdb.py ├── nms.py ├── pascal_voc.py ├── test.py ├── timer.py ├── train_svms.py ├── utils34_win64 │ ├── cython_bbox.pyd │ └── cython_nms.pyd ├── utils35_win64 │ ├── cython_bbox.pyd │ └── cython_nms.pyd └── voc_eval.py ├── helpers.py ├── helpers_cntk.py ├── imdb_data.py └── resources ├── cntk ├── config.cntk └── model.pdf └── python35_64bit_requirements ├── opencv_python-3.2.0-cp35-cp35m-win_amd64.whl └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyx 3 | /__pycache__/ 4 | /backup_v0/ 5 | /data/liebherr_v4/ 6 | /fastRCNN/__pycache__/ 7 | /proc/ 8 | /resources/cntk/AlexNet.model 9 | /resources/pascalVocData/ 10 | /results/ 11 | /selectivesearch/ 12 | -------------------------------------------------------------------------------- /1_computeRois.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys, os, importlib, random 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | #################################### 8 | # Parameters 9 | #################################### 10 | 
boShowImg = True 11 | subdirs = ['positive', 'testImages', 'negative'] 12 | 13 | #no need to change these parameters 14 | boAddSelectiveSearchROIs = True 15 | boAddGridROIs = True 16 | boFilterROIs = True 17 | if datasetName.lower() == "pascalvoc": 18 | print("No need to run ROI computation since Pascal VOC comes with pre-computed ROIs.") 19 | exit() 20 | 21 | 22 | #################################### 23 | # Main 24 | #################################### 25 | #init 26 | for subdir in subdirs: 27 | makeDirectory(roiDir) 28 | makeDirectory(roiDir + subdir) 29 | imgFilenames = getFilesInDirectory(imgDir + subdir, ".jpg") 30 | 31 | #loop over all images 32 | times = [] 33 | for imgIndex, imgFilename in enumerate(imgFilenames): 34 | #if os.path.exists(roiPath): 35 | # print "Skipping image since roi file already exists: " + imgFilename, imgIndex 36 | # continue 37 | 38 | # load image 39 | print("Processing image {} of {}: subdir={}, filename={}".format(imgIndex, len(imgFilenames), subdir, imgFilename)) 40 | imgPath = join(imgDir, subdir, imgFilename) 41 | imgOrig = imread(imgPath) 42 | 43 | # compute ROIs 44 | tstart = datetime.datetime.now() 45 | rois = computeRois(imgOrig, boAddSelectiveSearchROIs, boAddGridROIs, boFilterROIs, ss_kvals, ss_minSize, ss_max_merging_iterations, ss_nmsThreshold, 46 | roi_minDimRel, roi_maxDimRel, roi_maxImgDim, roi_maxAspectRatio, roi_minNrPixelsRel, roi_maxNrPixelsRel, 47 | grid_nrScales, grid_aspectRatios, grid_downscaleRatioPerIteration, grid_stepSizeRel) 48 | times.append((datetime.datetime.now() - tstart).total_seconds() * 1000) 49 | print(" Time roi computation [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 50 | roiPath = "{}/{}/{}.roi.txt".format(roiDir, subdir, imgFilename[:-4]) 51 | np.savetxt(roiPath, rois, fmt='%d') 52 | 53 | #visualize ROIs 54 | if boShowImg: 55 | debugScale = 800.0 / max(imWidthHeight(imgOrig)) 56 | img = imresize(imgOrig, debugScale) 57 | drawRectangles(img, rois*debugScale, color=(0, 255, 0), thickness=1) 58 | imshow(img, waitDuration = 1) 59 | roiImgPath = os.path.join(roiDir, subdir, imgFilename[:-4] + ".roi.jpg") 60 | imwrite(img, roiImgPath) 61 | 62 | print("Time per image [ms]: median={:.1f}, std={:.1f}, 90%-percentile={:.1f}".format(np.median(times), np.std(times), np.percentile(times, 90))) 63 | print("DONE.") -------------------------------------------------------------------------------- /2_cntkGenerateInputs.py: -------------------------------------------------------------------------------- 1 | import os, sys, importlib 2 | import shutil, time 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | #################################### 8 | # Parameters 9 | #################################### 10 | image_sets = ["train", "test"] 11 | 12 | 13 | #################################### 14 | # Main 15 | #################################### 16 | #clear imdb cache and other files 17 | if os.path.exists(cntkFilesDir): 18 | assert(cntkFilesDir.endswith("cntkFiles/")) 19 | userInput = input('--> INPUT: Press "y" to delete directory ' + cntkFilesDir + ": ") 20 | if userInput.lower() not in ['y', 'yes']: 21 | print("User input is %s: exiting now." 
% userInput) 22 | exit(-1) 23 | shutil.rmtree(cntkFilesDir) 24 | time.sleep(0.2) #avoid file access errors 25 | 26 | 27 | #create cntk representation for each image 28 | makeDirectory(cntkFilesDir) 29 | for image_set in image_sets: 30 | imdb = imdbs[image_set] 31 | counterGt = np.zeros(len(classes), np.int32) 32 | print("Number of images in set '{}' = {}".format(image_set, imdb.num_images)) 33 | 34 | #open files for writing 35 | cntkImgsPath, cntkRoiCoordsPath, cntkRoiLabelsPath, nrRoisPath = cntkInputPaths(cntkFilesDir, image_set) 36 | with open(cntkImgsPath, 'w') as cntkImgsFile, \ 37 | open(cntkRoiCoordsPath, 'w') as cntkRoiCoordsFile, \ 38 | open(cntkRoiLabelsPath, 'w') as cntkRoiLabelsFile, \ 39 | open(nrRoisPath, 'w') as nrRoisFile: 40 | 41 | # for each image, transform rois etc to cntk format 42 | for imgIndex in range(0, imdb.num_images): 43 | if imgIndex % 200 == 0: 44 | print("Processing image set '{}', image {} of {}".format(image_set, imgIndex, imdb.num_images)) 45 | imgPath = imdb.image_path_at(imgIndex) 46 | currRois = imdb.roidb[imgIndex]['boxes'] 47 | currGtOverlaps = imdb.roidb[imgIndex]['gt_overlaps'] 48 | for i in imdb.roidb[imgIndex]['gt_classes']: 49 | counterGt[i] += 1 50 | 51 | #get DNN inputs for image 52 | #Note: this also marks other ROIs as 'positives', if overlap with GT is above a threshold 53 | labelsStr, roisStr, _ = getCntkInputs(imgPath, currRois, currGtOverlaps, train_posOverlapThres, nrClasses, cntk_nrRois, cntk_padWidth, cntk_padHeight) 54 | 55 | #update cntk data 56 | nrRoisFile.write("{}\n".format(len(currRois))) 57 | cntkImgsFile.write("{}\t{}\t0\n".format(imgIndex, imgPath)) 58 | cntkRoiCoordsFile.write("{} |rois{}\n".format(imgIndex, roisStr)) 59 | cntkRoiLabelsFile.write("{} |roiLabels{}\n".format(imgIndex, labelsStr)) 60 | 61 | #print debug info 62 | if image_set == 'train': 63 | for i in range(len(classes)): 64 | print(" {:3}: Found {} objects of class {}.".format(i, counterGt[i], classes[i])) 65 | 66 | print("DONE.") 67 | -------------------------------------------------------------------------------- /3_runCntk.py: -------------------------------------------------------------------------------- 1 | from PARAMETERS import * 2 | from helpers_cntk import * 3 | 4 | 5 | #################################### 6 | # MAIN 7 | #################################### 8 | makeDirectory(modelDir) 9 | print ("classifier = " + classifier) 10 | print ("cntk_lr_per_image = " + str(cntk_lr_per_image)) 11 | 12 | # optionally retrain DNN 13 | # if the classifier is svm, then simply return the 4096-floats penultimate layer as model 14 | # otherwise add new output layer, retrain the DNN, and return this new model. 
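# (illustrative sketch only; layer/function details are assumptions -- see init_train_fast_rcnn,
#  imported from helpers_cntk, for the actual implementation. Conceptually:
#      base = load_model(cntkResourcesDir + "AlexNet.model")
#      if boSkipTraining: use base up to its 4096-dim penultimate layer as a fixed feature extractor
#      else:              append a new Dense(nrClasses) output layer and retrain end-to-end)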
15 | if classifier == 'svm': 16 | boSkipTraining = True 17 | else: 18 | boSkipTraining = False 19 | model = init_train_fast_rcnn(cntk_padHeight, cntk_padWidth, nrClasses, cntk_nrRois, cntk_mb_size, cntk_max_epochs, 20 | cntk_lr_per_image, cntk_l2_reg_weight, cntk_momentum_time_constant, cntkFilesDir, boSkipTraining) 21 | 22 | # write model to disk 23 | model_path = os.path.join(modelDir, "frcn_" + classifier + ".model") 24 | print("Writing model to %s" % model_path) 25 | model.save(model_path) 26 | 27 | # compute output of every image and write to disk 28 | image_sets = ["test", "train"] 29 | for image_set in image_sets: 30 | outParsedDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/" 31 | makeDirectory(outParsedDir) 32 | run_fast_rcnn(model, image_set, cntk_padHeight, cntk_padWidth, nrClasses, cntk_nrRois, cntkFilesDir, outParsedDir) 33 | 34 | print("DONE.") -------------------------------------------------------------------------------- /4_trainSvm.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from fastRCNN.train_svms import SVMTrainer 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | ################################################# 8 | # Parameters 9 | ################################################# 10 | experimentName = "exp1" 11 | 12 | #no need to change these params 13 | cntkParsedOutputDir = cntkFilesDir + "train_svm_parsed/" 14 | 15 | 16 | 17 | ################################################# 18 | # Main 19 | ################################################# 20 | if classifier == "nn": 21 | print("No need to train SVM since using 'nn' classifier.") 22 | exit() 23 | print ("svm_targetNorm = " + str(svm_targetNorm)) 24 | print ("svm_retrainLimit = " + str(svm_retrainLimit)) 25 | print ("svm_posWeight = " + str(svm_posWeight)) 26 | print ("svm_C = " + str(svm_C)) 27 | print ("svm_B = " + str(svm_B)) 28 | print ("svm_penality = " + str(svm_penality)) 29 | print ("svm_loss = " + str(svm_loss)) 30 | print ("svm_evictThreshold = " + str(svm_evictThreshold)) 31 | print ("svm_nrEpochs = " + str(svm_nrEpochs)) 32 | 33 | #init 34 | makeDirectory(trainedSvmDir) 35 | np.random.seed(svm_rngSeed) 36 | imdb = imdbs["train"] 37 | net = DummyNet(4096, imdb.num_classes, cntkParsedOutputDir) 38 | svmWeightsPath, svmBiasPath, svmFeatScalePath = svmModelPaths(trainedSvmDir, experimentName) 39 | 40 | # add ROIs which significantly overlap with a ground truth object as positives 41 | if train_posOverlapThres > 0: 42 | print ("Adding ROIs with gt overlap >= %2.2f as positives ..." 
% (train_posOverlapThres)) 43 | existingPosCounter, addedPosCounter = updateRoisGtClassIfHighGtOverlap(imdb, train_posOverlapThres) 44 | print ("Number of positives originally: {} (in {} images)".format(existingPosCounter, imdb.num_images)) 45 | print ("Number of additional positives: {}.".format(addedPosCounter)) 46 | 47 | # start training 48 | svm = SVMTrainer(net, imdb, im_detect, svmWeightsPath, svmBiasPath, svmFeatScalePath, 49 | svm_C, svm_B, svm_nrEpochs, svm_retrainLimit, svm_evictThreshold, svm_posWeight, 50 | svm_targetNorm, svm_penality, svm_loss, svm_rngSeed) 51 | svm.train() 52 | print ("DONE.") 53 | -------------------------------------------------------------------------------- /5_evaluateResults.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from fastRCNN.test import test_net 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | #################################### 8 | # Parameters 9 | #################################### 10 | image_set = 'test' 11 | svmExperimentName = "exp1" 12 | 13 | #no need to change these 14 | cntkParsedOutputDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/" 15 | 16 | 17 | #################################### 18 | # Main 19 | #################################### 20 | print("classifier = " + classifier) 21 | print("image_set = " + image_set) 22 | imdb = imdbs[image_set] 23 | net = DummyNet(4096, imdb.num_classes, cntkParsedOutputDir) 24 | 25 | #load svm 26 | svmFeatScale = None 27 | if classifier == 'svm': 28 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svmExperimentName) 29 | net.params['cls_score'][0].data = svmWeights 30 | net.params['cls_score'][1].data = svmBias 31 | 32 | #create empty directory for evaluation files 33 | if type(imdb) == imdb_data: 34 | evalTempDir = None 35 | else: 36 | #pascal_voc implementation requires temporary directory for evaluation 37 | evalTempDir = os.path.join(procDir, "eval_mAP_" + image_set) 38 | makeDirectory(evalTempDir) 39 | deleteAllFilesInDirectory(evalTempDir, None) 40 | 41 | #compute mAPs 42 | evalResults = test_net(net, imdb, evalTempDir, svmFeatScale, classifier, nmsThreshold, boUsePythonImpl = True, overlapThreshold = evalVocOverlapThreshold) #, boApplyNms = False) #, boThresholdDetections = False) 43 | #writeTable("evalResults.tsv", [["CLASS","Average Precision (AP)"]] + evalResults) 44 | 45 | print("DONE.") -------------------------------------------------------------------------------- /5_visualizeResults.py: -------------------------------------------------------------------------------- 1 | import os, importlib, sys 2 | import PARAMETERS 3 | locals().update(importlib.import_module("PARAMETERS").__dict__) 4 | 5 | 6 | #################################### 7 | # Parameters 8 | #################################### 9 | image_set = 'test' #'train', 'test' 10 | svm_experimentName = 'exp1' 11 | 12 | #no need to change these parameters 13 | boIncludeGroundTruthRois = False #remove GT (perfect) ROIs which were added to the 'train' imageSet 14 | boUseNonMaximaSurpression = True 15 | visualizationDir = resultsDir + "visualizations" 16 | cntkParsedOutputDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/" 17 | if classifier == 'svm': 18 | prThresholds = np.linspace(0, 10, 21) 19 | else: 20 | prThresholds = np.linspace(0, 1, 21) 21 | 22 | 23 | 24 | #################################### 25 | # Main 26 | #################################### 27 | #init 28 | imdb = 
imdbs[image_set] 29 | gt_roidb = imdb.gt_roidb() 30 | recalls = collections.defaultdict(list) 31 | precisions = collections.defaultdict(list) 32 | 33 | #load svm 34 | print("classifier = " + classifier) 35 | makeDirectory(resultsDir) 36 | makeDirectory(visualizationDir) 37 | if classifier == "svm": 38 | print("Loading svm weights..") 39 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svm_experimentName) 40 | else: 41 | svmWeights, svmBias, svmFeatScale = (None, None, None) 42 | 43 | 44 | #loop over all images and visualize 45 | for imgIndex in range(0, imdb.num_images): 46 | imgPath = imdb.image_path_at(imgIndex) 47 | imgWidth, imgHeight = imWidthHeight(imgPath) 48 | print("Processing image {} of {}: {}".format(imgIndex, imdb.num_images, imgPath)) 49 | 50 | #load DNN output 51 | cntkOutputPath = os.path.join(cntkParsedOutputDir, str(imgIndex) + ".dat.npz") 52 | dnnOutput = np.load(cntkOutputPath)['arr_0'] 53 | assert(len(dnnOutput) == cntk_nrRois) 54 | 55 | #evaluate classifier for all rois and remove the zero-padded rois 56 | labels, scores = scoreRois(classifier, dnnOutput, svmWeights, svmBias, svmFeatScale, len(classes)) #, vis_decisionThresholds[classifier]) 57 | scores = scores[:len(imdb.roidb[imgIndex]['boxes'])] 58 | labels = labels[:len(imdb.roidb[imgIndex]['boxes'])] 59 | 60 | #remove the ground truth ROIs which were added for training purposes 61 | if not boIncludeGroundTruthRois: 62 | inds = np.where(imdb.roidb[imgIndex]['gt_classes'] == 0)[0] 63 | labels = [labels[i] for i in inds] 64 | scores = [scores[i] for i in inds] 65 | imdb.roidb[imgIndex]['boxes'] = imdb.roidb[imgIndex]['boxes'][inds] 66 | 67 | #perform non-maxima surpression. note that the set of labels detected in the image is not affected by this. 68 | nmsKeepIndices = [] 69 | if boUseNonMaximaSurpression: 70 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, labels, scores, imdb.roidb[imgIndex]['boxes']) 71 | print("Non-maxima surpression kept {:4} of {:4} rois (nmsThreshold={})".format(len(nmsKeepIndices), len(labels), nmsThreshold)) 72 | 73 | #visualize results 74 | imgDebug = visualizeResults(imgPath, labels, scores, imdb.roidb[imgIndex]['boxes'], classes, nmsKeepIndices, 75 | boDrawNegativeRois=False, boDrawNmsRejectedRois=False, decisionThreshold = vis_decisionThresholds[classifier]) 76 | imshow(imgDebug, waitDuration=1, maxDim = 800) 77 | imwrite(imgDebug, visualizationDir + "/" + classifier + "_" + str(imgIndex) + os.path.basename(imgPath)) 78 | 79 | 80 | #compute precision recall of the detection for different thresholds 81 | gtLabels = gt_roidb[imgIndex]['gt_classes'] 82 | gtBboxes = [Bbox(*rect) for rect in gt_roidb[imgIndex]['boxes']] 83 | 84 | for thres in prThresholds: 85 | # get detections with scores higher than the threshold and which were kept by nms 86 | keepInds = set(np.where((np.array(labels) > 0) & (np.array(scores) > thres))[0]) 87 | if boUseNonMaximaSurpression: 88 | keepInds = keepInds.intersection(nmsKeepIndices) 89 | detLabels = [labels[i] for i in keepInds] 90 | detBboxes = [Bbox(*imdb.roidb[imgIndex]['boxes'][i]) for i in keepInds] 91 | 92 | #compute precision recall of the detection 93 | precision, recall = detPrecisionRecall(detBboxes, detLabels, gtBboxes, gtLabels, 94 | evalVocOverlapThreshold, boPenalizeMultipleDetections=False) 95 | recalls[thres].append(recall) 96 | if precision != None: 97 | precisions[thres].append(precision) 98 | 99 | 100 | #compute precision and recall at different thresholds 101 | print("Precision/recall when rejecting detections below a 
given threshold:") 102 | outPR = [("Threshold", "Precision", "Recall")] 103 | for thres in prThresholds: 104 | if precisions[thres] == []: 105 | break 106 | p = np.mean(precisions[thres]) 107 | r = np.mean(recalls[thres]) 108 | outPR.append((thres, p, r)) 109 | print(" At threshold {:.2f}: precision = {:2.2f}, recall = {:2.2f}".format(thres, p, r)) 110 | #writeTable("precisionRecalls.tsv", outPR) 111 | 112 | print("DONE.") -------------------------------------------------------------------------------- /6_scoreImage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys, os, importlib, random, json 3 | import PARAMETERS 4 | from helpers_cntk import * 5 | locals().update(importlib.import_module("PARAMETERS").__dict__) 6 | 7 | 8 | #################################### 9 | # Parameters 10 | #################################### 11 | imgPath = r"C:/Users/pabuehle/Desktop/newImgs/WIN_20160803_11_30_07_Pro.jpg" 12 | 13 | #choose which classifier to use 14 | classifier = 'svm' 15 | svm_experimentName = 'exp1' 16 | 17 | # no need to change these parameters 18 | boAddSelectiveSearchROIs = True 19 | boAddGridROIs = True 20 | boFilterROIs = True 21 | boUseNonMaximaSurpression = True 22 | 23 | 24 | #################################### 25 | # Main 26 | #################################### 27 | random.seed(0) 28 | 29 | # load cntk model 30 | print("Loading DNN..") 31 | tstart = datetime.datetime.now() 32 | model_path = os.path.join(modelDir, "frcn_" + classifier + ".model") 33 | if not os.path.exists(model_path): 34 | raise Exception("Model {} not found.".format(model_path)) 35 | model = load_model(model_path) 36 | print("Time loading DNN [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 37 | 38 | # load trained svm 39 | if classifier == "svm": 40 | print("Loading svm weights..") 41 | tstart = datetime.datetime.now() 42 | svmWeights, svmBias, svmFeatScale = loadSvm(trainedSvmDir, svm_experimentName) 43 | print("Time loading svm [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 44 | else: 45 | svmWeights, svmBias, svmFeatScale = (None, None, None) 46 | 47 | # compute ROIs 48 | tstart = datetime.datetime.now() 49 | imgOrig = imread(imgPath) 50 | currRois = computeRois(imgOrig, boAddSelectiveSearchROIs, boAddGridROIs, boFilterROIs, ss_kvals, ss_minSize, 51 | ss_max_merging_iterations, ss_nmsThreshold, 52 | roi_minDimRel, roi_maxDimRel, roi_maxImgDim, roi_maxAspectRatio, roi_minNrPixelsRel, 53 | roi_maxNrPixelsRel, grid_nrScales, grid_aspectRatios, grid_downscaleRatioPerIteration, grid_stepSizeRel) 54 | currRois = currRois[:cntk_nrRois] # only keep first cntk_nrRois rois 55 | print("Time roi computation [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 56 | 57 | # prepare DNN inputs 58 | tstart = datetime.datetime.now() 59 | imgPadded = imresizeAndPad(imgOrig, cntk_padWidth, cntk_padHeight) 60 | _, _, roisCntk = getCntkInputs(imgPath, currRois, None, train_posOverlapThres, nrClasses, cntk_nrRois, cntk_padWidth, cntk_padHeight) 61 | arguments = { 62 | model.arguments[0]: [np.ascontiguousarray(np.array(imgPadded, dtype=np.float32).transpose(2, 0, 1))], # convert to CNTK's HWC format 63 | model.arguments[1]: [np.array(roisCntk, np.float32)] 64 | } 65 | print("Time cnkt input generation [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 66 | 67 | # run DNN model 68 | print("Running model..") 69 | tstart = datetime.datetime.now() 70 | dnnOutputs = 
model.eval(arguments)[0] 71 | dnnOutputs = dnnOutputs[:len(currRois)] # remove the zero-padded rois 72 | print("Time running model [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 73 | 74 | # score all ROIs 75 | tstart = datetime.datetime.now() 76 | labels, scores = scoreRois(classifier, dnnOutputs, svmWeights, svmBias, svmFeatScale, len(classes), 77 | decisionThreshold = vis_decisionThresholds[classifier]) 78 | print("Time making prediction [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 79 | 80 | # perform non-maxima surpression 81 | tstart = datetime.datetime.now() 82 | nmsKeepIndices = [] 83 | if boUseNonMaximaSurpression: 84 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, labels, scores, currRois) 85 | print("Non-maxima surpression kept {:4} of {:4} rois (nmsThreshold={})".format( 86 | len(nmsKeepIndices), len(labels), nmsThreshold)) 87 | print("Time non-maxima surpression [ms]: " + str((datetime.datetime.now() - tstart).total_seconds() * 1000)) 88 | 89 | # visualize results 90 | imgDebug = visualizeResults(imgPath, labels, scores, currRois, classes, nmsKeepIndices, 91 | boDrawNegativeRois=False, boDrawNmsRejectedRois=False) 92 | imshow(imgDebug, waitDuration=0, maxDim=800) 93 | 94 | # create json-encoded string of all detections 95 | outDict = [{"label": str(l), "score": str(s), "nms": str(False), "left": str(r[0]), "top": str(r[1]), "right": str(r[2]), "bottom": str(r[3])} for l,s, r in zip(labels, scores, currRois)] 96 | for i in nmsKeepIndices: 97 | outDict[i]["nms"] = str(True) 98 | outJsonString = json.dumps(outDict) 99 | print("Json-encoded detections: " + outJsonString[:120] + "...") 100 | print("DONE.") 101 | 102 | #--- optional code ---# 103 | 104 | # write all detections to file, and show how to read in again to visualize 105 | # writeDetectionsFile("detections.tsv", outDict, classes) 106 | # labels2, scores2, currRois2, nmsKeepIndices2 = parseDetectionsFile("detections.tsv", lutClass2Id) 107 | # imgDebug2 = visualizeResults(imgPath, labels2, scores2, currRois2, classes, nmsKeepIndices2, # identical to imgDebug 108 | # boDrawNegativeRois=False, boDrawNmsRejectedRois=False) 109 | # imshow(imgDebug2, waitDuration=0, maxDim=800) 110 | 111 | # extract crop of the highest scored ROI 112 | # maxScore = -float("inf") 113 | # maxScoreRoi = [] 114 | # for index, (label,score) in enumerate(zip(labels,scores)): 115 | # if score > maxScore and label > 0: #and index in nmsKeepIndices: 116 | # maxScore = score 117 | # maxScoreRoi = currRois[index] 118 | # if maxScoreRoi == []: 119 | # print("WARNING: not a single object detected") 120 | # else: 121 | # imgCrop = imgOrig[maxScoreRoi[1]:maxScoreRoi[3], maxScoreRoi[0]:maxScoreRoi[2], :] 122 | # imwrite(imgCrop, outCropDir + os.path.basename(imgPath)) 123 | # imshow(imgCrop) 124 | 125 | -------------------------------------------------------------------------------- /A1_annotateImages.py: -------------------------------------------------------------------------------- 1 | import os, sys, importlib, shutil 2 | import PARAMETERS 3 | locals().update(importlib.import_module("PARAMETERS").__dict__) 4 | 5 | 6 | #################################### 7 | # Parameters 8 | #################################### 9 | imagesToAnnotateDir = "C:/Users/pabuehle/Desktop/newImgs/" 10 | 11 | #no need to change these params 12 | drawingMaxImgSize = 1000.0 13 | annotationsFile = resultsDir + "annotations.tsv" 14 | minNrPixels = -1 15 | 16 | 17 | #################################### 18 | # Functions 19 | 
#################################### 20 | def event_cv2GetRectangles(event, x, y, flags, param): 21 | global cv2GetRectangle_global_bboxes 22 | global cv2GetRectangle_global_leftButtonDownPoint 23 | boLeftMouseDown = flags == cv2.EVENT_FLAG_LBUTTON 24 | 25 | #draw all previous bounding boxes 26 | imgCopy = image.copy() 27 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes) 28 | if len(cv2GetRectangle_global_bboxes)>0: 29 | drawRectangles(imgCopy, [cv2GetRectangle_global_bboxes[-1]], color = (255, 0, 0)) 30 | 31 | #handle mouse clicks 32 | if event == cv2.EVENT_LBUTTONDOWN: 33 | cv2GetRectangle_global_leftButtonDownPoint = (x, y) 34 | elif event == cv2.EVENT_LBUTTONUP: 35 | pt1 = cv2GetRectangle_global_leftButtonDownPoint 36 | pt2 = (x, y) 37 | minPt = (min(pt1[0], pt2[0]), min(pt1[1], pt2[1])) 38 | maxPt = (max(pt1[0], pt2[0]), max(pt1[1], pt2[1])) 39 | imgWidth, imgHeight = imWidthHeight(image) 40 | minPt = ptClip(minPt, imgWidth, imgHeight) 41 | maxPt = ptClip(maxPt, imgWidth, imgHeight) 42 | cv2GetRectangle_global_bboxes.append(minPt + maxPt) 43 | elif boLeftMouseDown: 44 | cv2.rectangle(imgCopy, cv2GetRectangle_global_leftButtonDownPoint, (x, y), (255, 255, 0), 1) 45 | else: 46 | drawCrossbar(imgCopy, (x, y)) 47 | cv2.imshow("image", imgCopy) 48 | 49 | 50 | def procBoundingBoxes(rectsIn, imageUnscaled, scaleFactor): 51 | if len(rectsIn) <= 0: 52 | return rectsIn 53 | else: 54 | rects = copy.deepcopy(rectsIn) 55 | for index in range(len(rects)): 56 | for i in range(4): 57 | rects[index][i] = int(round(rects[index][i] / scaleFactor)) 58 | imgWidth, imgHeight = imWidthHeight(imageUnscaled) 59 | bboxes = [Bbox(*rect) for rect in rects] 60 | for bbox in bboxes: 61 | bbox.crop(imgWidth, imgHeight) 62 | assert(bbox.isValid()) 63 | return [bbox.rect() for bbox in bboxes] 64 | 65 | 66 | 67 | #################################### 68 | # Main 69 | #################################### 70 | makeDirectory(resultsDir) 71 | imgFilenames = [f for f in os.listdir(imagesToAnnotateDir) if f.lower().endswith(".jpg")] 72 | 73 | print("Using annotations file: " + annotationsFile) 74 | if annotationsFile and os.path.exists(annotationsFile): 75 | shutil.copyfile(annotationsFile, annotationsFile + ".backup.tsv") 76 | data = readTable(annotationsFile) 77 | annotationsLUT = getDictionary(getColumn(data,0), getColumn(data,1), False) 78 | else: 79 | annotationsLUT = dict() 80 | 81 | 82 | #loop over each image and get annotation 83 | for imgFilenameIndex,imgFilename in enumerate(imgFilenames): 84 | print("imgFilenameIndex = {}, imgFilename = {}".format(imgFilenameIndex, imgFilename)) 85 | imgPath = imagesToAnnotateDir + imgFilename 86 | print("Processing image {0} of {1}: {2}".format(imgFilenameIndex, len(imgFilenames), imgPath)) 87 | bBoxPath = imgPath[:-4] + ".bboxes.tsv" 88 | 89 | #compute scale factor 90 | imgWidth, imgHeight = imWidthHeight(imgPath) 91 | scaleFactor = min(1, drawingMaxImgSize / max(imgWidth, imgHeight)) 92 | if imgWidth * imgHeight < minNrPixels: 93 | print("Low resolution ({0},{1}) hence skipping image: {2}.".format(imgWidth, imgHeight, imgPath)) 94 | continue 95 | 96 | #load existing ground truth if provided 97 | cv2GetRectangle_global_bboxes = [] 98 | if os.path.exists(bBoxPath): 99 | print("Skipping image since ground truth already exists: %s." 
% imgPath) 100 | continue 101 | 102 | #draw image 103 | imageUnscaled = imread(imgPath) 104 | image = imresize(imageUnscaled, scaleFactor) 105 | cv2.namedWindow("image") 106 | cv2.setMouseCallback("image", event_cv2GetRectangles) 107 | imgCopy = image.copy() 108 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes) 109 | cv2.imshow("image", imgCopy) 110 | 111 | #wait for user input 112 | while True: 113 | key = chr(cv2.waitKey()) #& 0xFF 114 | 115 | #skip 116 | if key == "s": 117 | if os.path.exists(bBoxPath): 118 | print("Skipping image hence deleting existing bbox file: " + bBoxPath) 119 | os.remove(bBoxPath) 120 | annotationsLUT[imgPath] = "skip" 121 | if annotationsFile: 122 | writeTable(annotationsFile, sortDictionary(annotationsLUT)) 123 | break 124 | 125 | #undo 126 | if key == "u": 127 | if len(cv2GetRectangle_global_bboxes) >= 1: 128 | cv2GetRectangle_global_bboxes = cv2GetRectangle_global_bboxes[:-1] 129 | imgCopy = image.copy() 130 | drawRectangles(imgCopy, cv2GetRectangle_global_bboxes) 131 | cv2.imshow("image", imgCopy) 132 | 133 | #next image 134 | elif key == "n": 135 | bboxes = procBoundingBoxes(cv2GetRectangle_global_bboxes, imageUnscaled, scaleFactor) 136 | writeTable(bBoxPath, bboxes) 137 | annotationsLUT[imgPath] = bboxes 138 | if annotationsFile: 139 | writeTable(annotationsFile, sortDictionary(annotationsLUT)) 140 | break 141 | 142 | #quit 143 | elif key == "q": 144 | sys.exit() 145 | 146 | cv2.destroyAllWindows() 147 | print("DONE.") -------------------------------------------------------------------------------- /A2_annotateBboxLabels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import cv2, os, sys, time, importlib 3 | from tkinter import * 4 | from PIL import ImageTk 5 | import PARAMETERS 6 | locals().update(importlib.import_module("PARAMETERS").__dict__) 7 | 8 | 9 | #################################### 10 | # Parameters 11 | #################################### 12 | imagesToAnnotateDir = "C:/Users/pabuehle/Desktop/newImgs/" 13 | 14 | #no need to change these 15 | boxWidth = 10 16 | boxHeight = 2 17 | drawingMaxImgSize = 1000 18 | objectNames = classes[1:] 19 | objectNames = np.sort(objectNames).tolist() 20 | objectNames += ["UNDECIDED", "EXCLUDE"] 21 | 22 | 23 | 24 | #################################### 25 | # Helper functions 26 | #################################### 27 | def buttonPressedCallback(s): 28 | global tkLastButtonPressed 29 | global tkBoButtonPressed 30 | tkLastButtonPressed = s 31 | tkBoButtonPressed = True 32 | 33 | 34 | 35 | #################################### 36 | # Main 37 | #################################### 38 | #create UI 39 | tk = Tk() 40 | w = Canvas(tk, width=len(objectNames) * boxWidth, height=len(objectNames) * boxHeight, bd = boxWidth, bg = 'white') 41 | w.grid(row = len(objectNames), column = 0, columnspan = 2) 42 | for objectIndex,objectName in enumerate(objectNames): 43 | b = Button(width=boxWidth, height=boxHeight, text=objectName, command=lambda s = objectName: buttonPressedCallback(s)) 44 | b.grid(row = objectIndex, column = 0) 45 | 46 | 47 | #loop over all images 48 | imgFilenames = getFilesInDirectory(imagesToAnnotateDir, ".jpg") 49 | for imgIndex, imgFilename in enumerate(imgFilenames): 50 | print("imgIndex={}, imgFilename={}".format(imgIndex, imgFilename)) 51 | labelsPath = imagesToAnnotateDir + "/" + imgFilename[:-4] + ".bboxes.labels.tsv" 52 | if os.path.exists(labelsPath): 53 | continue 54 | 55 | #load image and bboxes 56 | imgPath = 
imagesToAnnotateDir + "/" + imgFilename 57 | print("imgIndex = {}, imgPath = {}".format(imgIndex, imgPath)) 58 | img = imread(imgPath) 59 | rectsPath = imagesToAnnotateDir + "/" + imgFilename[:-4] + ".bboxes.tsv" 60 | rects = readTable(rectsPath) 61 | rects = [ToIntegers(rect) for rect in rects] 62 | 63 | #annotate each rectangle in turn 64 | labels = [] 65 | for rectIndex,rect in enumerate(rects): 66 | imgCopy = img.copy() 67 | drawRectangles(imgCopy, [rect], thickness = 15) 68 | 69 | #draw image in tk window 70 | imgTk, _ = imresizeMaxDim(imgCopy, drawingMaxImgSize) 71 | imgTk = imconvertCv2Pil(imgTk) 72 | imgTk = ImageTk.PhotoImage(imgTk) 73 | label = Label(tk, image=imgTk) 74 | label.grid(row=0, column=1, rowspan=drawingMaxImgSize) 75 | tk.update_idletasks() 76 | tk.update() 77 | 78 | #busy-wait until button pressed 79 | tkBoButtonPressed = False 80 | tkLastButtonPressed = None 81 | while not tkBoButtonPressed: 82 | tk.update_idletasks() 83 | tk.update() 84 | 85 | #store result 86 | print("tkLastButtonPressed", tkLastButtonPressed) 87 | labels.append(tkLastButtonPressed) 88 | 89 | writeFile(labelsPath, labels) 90 | tk.destroy() 91 | print("DONE.") -------------------------------------------------------------------------------- /B1_evaluateRois.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys, os, importlib 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | 8 | #################################### 9 | # Parameters 10 | #################################### 11 | subdirs = ['positive'] 12 | 13 | 14 | #################################### 15 | # Main 16 | #################################### 17 | overlaps = [] 18 | roiCounts = [] 19 | for subdir in subdirs: 20 | imgFilenames = getFilesInDirectory(imgDir + subdir, ".jpg") 21 | 22 | #loop over all images 23 | for imgIndex,imgFilename in enumerate(imgFilenames): 24 | if imgIndex % 50 == 0: 25 | print("Processing subdir '{}', image {} of {}".format(subdir, imgIndex, len(imgFilenames))) 26 | # load ground truth 27 | imgPath = imgDir + subdir + "/" + imgFilename 28 | imgWidth, imgHeight = imWidthHeight(imgPath) 29 | gtRois, gtLabels = readGtAnnotation(imgPath) 30 | gtRois = [Bbox(*roi) for roi in gtRois] 31 | 32 | # load rois 33 | rois = readRois(roiDir, subdir, imgFilename) 34 | rois = rois[:cntk_nrRois] # only use the first N rois (similar to rest of code) 35 | rois = [Bbox(*roi) for roi in rois] 36 | roiCounts.append(len(rois)) 37 | 38 | # for each ground truth, compute if it is covered by an roi 39 | for gtIndex, (gtLabel, gtRoi) in enumerate(zip(gtLabels,gtRois)): 40 | maxOverlap = -1 41 | assert (gtRoi.max() <= max(imgWidth, imgHeight) and gtRoi.max() >= 0) 42 | if gtLabel in classes[1:]: 43 | for roi in rois: 44 | assert (roi.max() <= max(imgWidth, imgHeight) and roi.max() >= 0) 45 | overlap = bboxComputeOverlapVoc(gtRoi, roi) 46 | maxOverlap = max(maxOverlap, overlap) 47 | overlaps.append(maxOverlap) 48 | print("Average number of rois per image " + str(int(1.0 * sum(roiCounts) / len(imgFilenames)))) 49 | 50 | #compute recall at different overlaps 51 | recalls = [] 52 | overlaps = np.array(overlaps, np.float32) 53 | for overlapThreshold in np.linspace(0,1,21): 54 | recall = 1.0 * sum(overlaps >= overlapThreshold) / len(overlaps) 55 | recalls.append(recall) 56 | print("At threshold {:.2f}: recall = {:2.2f}".format(overlapThreshold, recall)) 57 | print("Mean recall = 
{:2.2}".format(np.mean(recalls))) -------------------------------------------------------------------------------- /B2_cntkVisualizeInputs.py: -------------------------------------------------------------------------------- 1 | import os, importlib, sys 2 | import PARAMETERS 3 | locals().update(importlib.import_module("PARAMETERS").__dict__) 4 | 5 | 6 | #################################### 7 | # Parameters 8 | #################################### 9 | image_set = 'test' # 'train', 'test' 10 | 11 | #no need to change these parameters 12 | parseNrImages = 50 #for speed reasons only parse CNTK file for the first N images 13 | boUseNonMaximaSurpression = False 14 | 15 | 16 | 17 | #################################### 18 | # Main 19 | #################################### 20 | print("Load ROI co-ordinates and labels") 21 | cntkImgsPath, cntkRoiCoordsPath, cntkRoiLabelsPath, nrRoisPath = cntkInputPaths(cntkFilesDir, image_set) 22 | imgPaths = getColumn(readTable(cntkImgsPath),1) 23 | nrRealRois = [int(s) for s in readFile(nrRoisPath)] 24 | roiAllLabels = parseCntkRoiLabels(cntkRoiLabelsPath, cntk_nrRois, len(classes), parseNrImages) 25 | if parseNrImages: 26 | imgPaths = imgPaths[:parseNrImages] 27 | nrRealRois = nrRealRois[:parseNrImages] 28 | roiAllLabels = roiAllLabels[:parseNrImages] 29 | roiAllCoords = parseCntkRoiCoords(imgPaths, cntkRoiCoordsPath, cntk_nrRois, cntk_padWidth, cntk_padHeight, parseNrImages) 30 | assert(len(imgPaths) == len(roiAllCoords) == len(roiAllLabels) == len(nrRealRois)) 31 | 32 | 33 | #loop over all images and visualize 34 | for imgIndex,imgPath in enumerate(imgPaths): 35 | print("Visualizing image %d at %s..." %(imgIndex,imgPath)) 36 | roiCoords = roiAllCoords[imgIndex][:nrRealRois[imgIndex]] 37 | roiLabels = roiAllLabels[imgIndex][:nrRealRois[imgIndex]] 38 | 39 | #perform non-maxima surpression. note that the detected classes in the image is not affected by this. 
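    # (i.e. NMS greedily keeps the top-scoring ROI and discards ROIs that overlap it by more than nmsThreshold, then repeats; here all scores are passed as 0, so which ROI of an overlapping group survives is arbitrary)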
40 | nmsKeepIndices = [] 41 | if boUseNonMaximaSurpression: 42 | imgWidth, imgHeight = imWidthHeight(imgPath) 43 | nmsKeepIndices = applyNonMaximaSuppression(nmsThreshold, roiLabels, [0] * len(roiLabels), roiCoords) 44 | print("Non-maxima surpression kept {} of {} rois (nmsThreshold={})".format(len(nmsKeepIndices), len(roiLabels), nmsThreshold)) 45 | 46 | #visualize results 47 | imgDebug = visualizeResults(imgPath, roiLabels, None, roiCoords, classes, nmsKeepIndices, boDrawNegativeRois=False) 48 | imshow(imgDebug, waitDuration=0, maxDim = 800) 49 | print("DONE.") 50 | -------------------------------------------------------------------------------- /B3_cntkAnalyzeInputs.py: -------------------------------------------------------------------------------- 1 | import os, sys, importlib 2 | import shutil, time 3 | import PARAMETERS 4 | locals().update(importlib.import_module("PARAMETERS").__dict__) 5 | 6 | 7 | #################################### 8 | # Parameters 9 | #################################### 10 | image_set = "train" 11 | 12 | 13 | #################################### 14 | # Main 15 | #################################### 16 | # read ground truth and ROIs 17 | if not os.path.exists(cntkFilesDir + image_set + ".cache_gt_roidb.pkl"): 18 | raise Exception("Run 2_cntkGenerateInputs.py before executing this script.") 19 | imdb = imdbs[image_set] 20 | gtRois = imdb.gt_roidb() 21 | print("Number of images in set '{}' = {}".format(image_set, imdb.num_images)) 22 | 23 | # extract width, height, etc for all ground truth annotations in all images 24 | roiInfos = [] 25 | for imgIndex in range(0, imdb.num_images): 26 | imgPath = imdb.image_path_at(imgIndex) 27 | imgWidth, imgHeight = imWidthHeight(imgPath) 28 | 29 | if gtRois[imgIndex] != None: 30 | for gtRoi in gtRois[imgIndex]['boxes']: 31 | roiWidth = gtRoi[2] - gtRoi[0] +1 32 | roiHeight = gtRoi[3] - gtRoi[1] +1 33 | roiRelWidth = float(roiWidth) / imgWidth 34 | roiRelHeight = float(roiHeight) / imgHeight 35 | roiInfos.append((roiRelWidth, roiRelHeight, roiRelWidth * roiRelHeight, roiRelWidth / roiRelHeight)) 36 | 37 | # analyse typical width, height, etc of the ground truth annotations 38 | print("\nStatistics for ground truth annotations:") 39 | for percentile in np.linspace(0, 100, 21): 40 | print(" Percentile {:3.0f}: width = {:<.2f}, height = {:<.2f}, area = {:<.3f}, aspectRatio = {:<.2f}".format( 41 | percentile, 42 | np.percentile(getColumn(roiInfos, 0), percentile), 43 | np.percentile(getColumn(roiInfos, 1), percentile), 44 | np.percentile(getColumn(roiInfos, 2), percentile), 45 | np.percentile(getColumn(roiInfos, 3), percentile))) 46 | print("DONE.") 47 | -------------------------------------------------------------------------------- /PARAMETERS.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from helpers import * 3 | from imdb_data import imdb_data 4 | import fastRCNN, time, datetime 5 | from fastRCNN.pascal_voc import pascal_voc 6 | print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')) 7 | 8 | 9 | ############################ 10 | # Adjust these parameters 11 | # to make scripts run 12 | ############################ 13 | rootDir = os.path.dirname(os.path.realpath(sys.argv[0])) 14 | 15 | ############################ 16 | # default parameters 17 | ############################ 18 | datasetName = "grocery" 19 | 20 | #directories 21 | imgDir = rootDir + "/data/" + datasetName + "/" 22 | procDir = rootDir + "/proc/" + 
datasetName + "/" 23 | resultsDir = rootDir + "/results/" + datasetName + "/" 24 | roiDir = procDir + "rois/" 25 | modelDir = procDir + "models/" 26 | cntkFilesDir = procDir + "cntkFiles/" 27 | trainedSvmDir = procDir + "trainedSvm/" 28 | cntkResourcesDir = rootDir + "/resources/cntk/" 29 | 30 | # ROI generation 31 | roi_maxImgDim = 200 # image size used for ROI generation 32 | roi_minDimRel = 0.01 # minimum relative width/height of a ROI 33 | roi_maxDimRel = 1.0 # maximum relative width/height of a ROI 34 | roi_minNrPixelsRel = 0 # minimum relative area covered by a ROI 35 | roi_maxNrPixelsRel = 1.0 # maximum relative area covered by a ROI 36 | roi_maxAspectRatio = 4.0 # maximum aspect ratio of a ROI, both vertically and horizontally 37 | ss_minSize = 20 # for a description of the selective search parameters see: 38 | ss_kvals = (50, 500, 6) # http://dlib.net/dlib/image_transforms/segment_image_abstract.h.html#find_candidate_object_locations 39 | ss_max_merging_iterations = 20 # 40 | ss_nmsThreshold = 0.85 # non-maxima suppression threshold run after selective search 41 | grid_nrScales = 7 # uniform grid ROIs: number of iterations from largest possible ROI to smaller ROIs 42 | grid_stepSizeRel = 0.5 # uniform grid ROIs: step size for sliding windows 43 | grid_aspectRatios = [1.0, 2.0, 0.5] # uniform grid ROIs: allowed aspect ratios of ROIs 44 | grid_downscaleRatioPerIteration = 1.5 # uniform grid ROIs: relative ROI width/height reduction per iteration, starting from largest possible ROI 45 | 46 | # cntk model 47 | cntk_nrRois = 2000 # DNN input number of ROIs per image. Zero-padded/truncated if necessary 48 | cntk_padWidth = 1000 # DNN input image width [pixels] 49 | cntk_padHeight = 1000 # DNN input image height [pixels] 50 | cntk_featureDimensions = {'svm': 4096} # DNN output, dimension of each ROI 51 | 52 | # nn and svm training 53 | classifier = 'svm' # Options: 'svm', 'nn'. Train either a Support Vector Machine, or directly the Neural Network 54 | train_posOverlapThres = 0.5 # DNN and SVM threshold for marking ROIs with significant overlap with a GT object as positive 55 | 56 | # nn training 57 | cntk_max_epochs = 18 # number of training epochs (only relevant if 'classifier' is set to 'nn') 58 | cntk_mb_size = 5 # minibatch size 59 | cntk_l2_reg_weight = 0.0005 # l2 regularizer weight 60 | cntk_lr_per_image = [0.01] * 10 + [0.001] * 5 + [0.0001] # learning rate per image 61 | cntk_momentum_time_constant = 10 # momentum 62 | 63 | # svm training 64 | svm_C = 0.001 # regularization parameter of the soft-margin error term 65 | svm_B = 10.0 # intercept scaling 66 | svm_nrEpochs = 2 # number of training iterations 67 | svm_retrainLimit = 2000 # number of new items to trigger SVM training 68 | svm_evictThreshold = -1.1 # remove easy negatives with decision value below this threshold 69 | svm_posWeight = "balanced" # automatically balance training set to correct for the majority of ROIs being negative 70 | svm_targetNorm = 20.0 # magic value from traditional R-CNN (helps with convergence) 71 | svm_penality = 'l2' # penalty norm 72 | svm_loss = 'l1' # loss norm 73 | svm_rngSeed = 3 # seed for randomization 74 | 75 | # postprocessing 76 | nmsThreshold = 0.3 # Non-Maxima suppression threshold (in range [0,1]) 77 | # The lower, the more ROIs will be combined. 
Used during evaluation and visualization (scripts 5_) 78 | vis_decisionThresholds = {'svm' : 0.5, # Reject detections with low confidence, used only in 5_visualizeResults 79 | 'nn' : None} 80 | 81 | # evaluation 82 | evalVocOverlapThreshold = 0.5 # voc-style intersection-over-union threshold used to determine if object was found 83 | 84 | 85 | 86 | ############################ 87 | # project-specific 88 | # parameters / overrides 89 | ############################ 90 | if datasetName.startswith("grocery"): 91 | classes = ('__background__', # always have '__background__' be at index 0 92 | "orange", "eggBox", "joghurt", "ketchup", "squash", "mushroom", "water", "mustard") 93 | 94 | 95 | # roi generation 96 | cntk_nrRois = 200 #this number is too low to get good accuracy but allows for fast training and scoring (for demo purposes) 97 | roi_minDimRel = 0.04 98 | roi_maxDimRel = 0.4 99 | roi_minNrPixelsRel = 2 * roi_minDimRel * roi_minDimRel 100 | roi_maxNrPixelsRel = 0.33 * roi_maxDimRel * roi_maxDimRel 101 | 102 | # postprocessing 103 | nmsThreshold = 0.01 104 | 105 | # database 106 | imdbs = dict() # database provider of images and image annotations 107 | for image_set in ["train", "test"]: 108 | imdbs[image_set] = imdb_data(image_set, classes, cntk_nrRois, imgDir, roiDir, cntkFilesDir, boAddGroundTruthRois = (image_set!='test')) 109 | 110 | 111 | elif datasetName.startswith("pascalVoc"): 112 | classes = ('__background__', 113 | 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 114 | 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') 115 | lutImageSet = {"train": "trainval", "test": "test"} 116 | 117 | # model training / scoring 118 | classifier = 'nn' 119 | 120 | # cntk model (Should train a model with mean-AP around 0.45) 121 | # more than 99% of the test images have fewer than 4000 rois, but 50% more than 2000 122 | cntk_mb_size = 2 123 | cntk_nrRois = 4000 124 | cntk_lr_per_image = [0.05] * 10 + [0.005] * 5 + [0.0005] 125 | 126 | # database 127 | imdbs = dict() 128 | for image_set, year in zip(["train", "test"], ["2007", "2007"]): 129 | imdbs[image_set] = fastRCNN.pascal_voc(lutImageSet[image_set], year, classes, cntk_nrRois, cacheDir = cntkFilesDir) 130 | print("Number of {} images: {}".format(image_set, imdbs[image_set].num_images)) 131 | 132 | else: 133 | raise Exception("Unknown dataset name: " + datasetName) 134 | 135 | 136 | 137 | ############################ 138 | # computed parameters 139 | ############################ 140 | nrClasses = len(classes) 141 | cntk_featureDimensions['nn'] = nrClasses 142 | lutClass2Id = dict(zip(classes, range(len(classes)))) 143 | 144 | print("PARAMETERS: datasetName = " + datasetName) 145 | assert cntk_padWidth == cntk_padHeight, "ERROR: different width and height for padding not supported." 146 | assert classifier.lower() in ['svm','nn'], "ERROR: only 'nn' or 'svm' classifier supported." 147 | assert not (datasetName == 'pascalVoc' and classifier == 'svm'), "ERROR: 'svm' classifier for pascal VOC not supported." 148 | assert(train_posOverlapThres >= 0 and train_posOverlapThres <= 1) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Fast R-CNN Object Detection Tutorial for Microsoft Cognitive Toolkit (CNTK) 3 | ============== 4 | 5 | ```diff 6 | + Update v2.0.1 (June 2017): 7 | + Updated documentation to include Visual Object Tagging Tool as an annotation option. 
8 | + Update v2 (June 2017): 9 | + Updated code to be compatible with the CNTK 2.0.0 release. 10 | + Update v1 (Feb 2017): 11 | + This tutorial was updated to use CNTK's Python wrappers. Now all processing happens in-memory during scoring. See script 6_scoreImage.py for an example. Furthermore, we switched to a much more accurate and faster implementation of Selective Search. 12 | + Note that, at the time of writing, CNTK does not support Python 2. If you need Python 2 then please refer to the [previous version](https://github.com/Azure/ObjectDetectionUsingCntk/tree/7edd3276a189bad862dc54e9f73b7cfcec5ae562) of this tutorial. 13 | ``` 14 | 15 | DESCRIPTION 16 | -------------- 17 | 18 | Object Detection is one of the main problems in Computer Vision. Traditionally, this required expert knowledge to identify and implement so-called “features” that highlight the position of objects in the image. Starting in 2012 with the famous AlexNet paper, Deep Neural Networks have been used to find these features automatically. This led to a huge improvement in the field for a large range of problems. 19 | 20 | This tutorial uses Microsoft Cognitive Toolkit's (CNTK) fast R-CNN implementation (see the [Fast R-CNN](#fast-r-cnn) section for a description), which was shown to produce state-of-the-art results for [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/), one of the main object detection challenges in the field. 21 | 22 | GOALS 23 | -------------- 24 | 25 | The goal of this tutorial is to show how to train and test your own Deep Learning object detection model using [Microsoft Cognitive Toolkit (CNTK)](https://github.com/Microsoft/CNTK). Example data and annotations are provided, but the reader can also bring their own images and train their own, unique object detector. 26 | 27 | The tutorial is split into four parts: 28 | - [Part 1](#part-1) shows how to train an object detection model for the example data without retraining the provided Neural Network, but instead training an external classifier on its output. This approach works particularly well with small datasets, and does not require expertise with deep learning. 29 | - [Part 2](#part-2) extends this approach to refine the Neural Network directly, without the need for an external classifier. 30 | - [Part 3](#part-3) illustrates how to annotate your own images and use these to train an object detection model for your specific use case. 31 | - [Part 4](#part-4) covers how to reproduce published results on the Pascal VOC dataset. 32 | 33 | Previous expertise with Machine Learning is not required to complete this tutorial, but it is very helpful for understanding the underlying principles. More information on the topic can be found at [CNTK's Fast-RCNN page](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Detection/FastRCNN). 34 | 35 | 36 | 37 | 38 | PREREQUISITES 39 | -------------- 40 | 41 | This tutorial was tested using CNTK v2.0.0, and assumes that CNTK was installed with the (default) Anaconda Python interpreter. Note that the code will only run on v2.0 due to breaking changes in other versions. 42 | 43 | CNTK can be easily installed by following the instructions on the [script-driven installation page](https://github.com/Microsoft/CNTK/wiki/Setup-Windows-Binary-Script). This will also automatically add an Anaconda Python distribution. At the time of writing, the default Python version is 3.5. 44 | 45 | A dedicated GPU is not required, but is recommended for retraining the Neural Network (part 2). 
If you lack a strong GPU, don't want to install CNTK yourself, or want to train a model using multiple GPUs, then consider using Azure's Data Science Virtual Machine. See the [Cortana Intelligence Gallery](https://gallery.cortanaintelligence.com/Solution/Linux-Data-Science-Virtual-Machine-3) for a 1-click deployment solution. 46 | 47 | 56 | 57 | Several Python packages are required to execute the Python scripts. These libraries can be installed easily using the provided Python wheels by opening a command prompt and running: 58 | ````bash 59 | c:/local/CNTK-2-0/cntk/Scripts/cntkpy35.bat 60 | cd resources/python35_64bit_requirements/ 61 | pip.exe install -r requirements.txt 62 | ```` 63 | 64 | In the code snippet above, we assumed that the CNTK root directory is C:/local/CNTK-2-0/. The Python wheels were originally downloaded from this [page](http://www.lfd.uci.edu/~gohlke/pythonlibs/). 65 | 66 | Finally, the file *AlexNet.model* is too big to be hosted on GitHub and hence needs to be downloaded manually from [here](https://www.cntk.ai/Models/AlexNet/AlexNet.model) and saved as */resources/cntk/AlexNet.model*. 67 | 68 | 69 | 70 | FOLDER STRUCTURE 71 | -------------- 72 | 73 | |Folder| Description 74 | |---|--- 75 | |/| Root directory 76 | |/data/| Directory containing images for different object recognition projects 77 | |/data/grocery/| Example data for grocery item detection in refrigerators 78 | |/data/grocery/positive/| Images and annotations to train the model 79 | |/data/grocery/negative/| Images used as negatives during model training 80 | |/data/grocery/testImages/| Test images used to evaluate model accuracy 81 | |/doc/| Resources such as images for this readme page 82 | |/fastRCNN/| Slightly modified code used in R-CNN publications 83 | |/resources/| All provided resources are in here 84 | |/resources/cntk/| CNTK configuration file and pre-trained AlexNet model 85 | |/resources/python35_64bit_requirements/| Python wheels and requirements file for 64-bit Python version 3.5 86 | 87 | 88 | All scripts used in this tutorial are located in the root folder. 89 | 90 | 91 | PART 1 92 | -------------- 93 | In the first part of this tutorial we will train a classifier which uses, but does not modify, a pre-trained deep neural network. See the [Fast R-CNN](#fast-r-cnn) section for details of the employed approaches. As example data, 25 images of grocery items inside refrigerators are provided, split into 20 images for training while the remaining 5 are used as the test set. The training images contain a total of 180 annotated objects: 94 | ``` 95 | Egg box, joghurt, ketchup, mushroom, mustard, orange, squash, and water. 96 | ``` 97 | Note that 20 training images is a very low number and too little to train a high-accuracy detector. Nevertheless, even this small dataset is sufficient to return plausible detections, as can be seen in step 5. 98 | The steps have to be executed in order, and after each step we recommend inspecting which files are written, where they are written to, and what their content is (mostly the content is written as plain text files). 99 | 100 | 101 | 102 | 103 | ### STEP 1: Computing Regions of Interest 104 | `Script: 1_computeRois.py` 105 | 106 | Regions of interest (ROIs) are computed for each image independently using a 3-step approach: First, Selective Search is used to generate hundreds of ROIs per image. These ROIs often fit tightly around some objects but miss other objects in the image (see the [Selective Search](#selective-search) section). 
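The selective-search call itself is compact. Below is a minimal sketch of this first step, assuming the `dlib` package and reusing the `ss_*` values from `PARAMETERS.py`; the tutorial's own wrapper (`computeRois`) additionally downscales the image to `roi_maxImgDim` and post-filters the boxes:

```python
import cv2, dlib

# load an example training image and convert BGR -> RGB for dlib
img = cv2.cvtColor(cv2.imread("data/grocery/positive/0.jpg"), cv2.COLOR_BGR2RGB)

rects = []  # filled in-place with dlib.rectangle objects
dlib.find_candidate_object_locations(img, rects, kvals=(50, 500, 6),
                                     min_size=20, max_merging_iterations=20)
rois = [[r.left(), r.top(), r.right(), r.bottom()] for r in rects]
print("Selective search proposed {} ROIs".format(len(rois)))
```
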
Many of the ROIs are bigger, smaller, etc. than the typical grocery item in our dataset. Hence, in a second step, these ROIs, as well as ROIs which are too similar to one another, are discarded. Finally, to complement the ROIs detected by Selective Search, ROIs that uniformly cover the image are added at different scales and aspect ratios. 107 | 108 | The final ROIs are written for each image separately to the files *[imageName].roi.txt* in the *proc/grocery/rois/* folder. 109 | 110 | For the grocery dataset, selective search typically generates around 1000 ROIs per image, plus on average another 2000 ROIs sampled uniformly from the image. A high number of ROIs typically leads to better object detection performance, at the expense, however, of longer running time. Hence the parameter `cntk_nrRois` can be used to keep only a subset of the ROIs (e.g. if `cntk_nrRois = 2000` then typically all ROIs from selective search are preserved, plus the 1000 largest ROIs generated using uniform sampling). 111 | 112 | The goodness of these ROIs can be measured by counting how many of the ground truth annotated objects in the image are covered by at least one ROI, where "covered" is defined as having an overlap greater than a given threshold. Script `B1_evaluateRois.py` outputs these counts at different threshold values (a minimal sketch of this computation follows the figure below). For example, for a threshold of 0.5 and 2000 ROIs, the recall is around 98%, while with 200 ROIs the recall is around 85%. It is important that the recall at a threshold of 0.5 is close to 100%, since even a perfect classifier cannot find an object in the image if it is not covered by at least one ROI. 113 | 114 | ROIs computed using Selective Search (left); ROIs from the image above after discarding ROIs that are too small, too big, etc. (middle); Final set of ROIs after adding ROIs that uniformly cover the image (right). 115 | <p align="center">

116 | alt text 117 | alt text 118 | alt text 119 |

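To make the grid-ROI step concrete, below is a minimal sketch of how boxes that uniformly cover an image at several scales and aspect ratios could be generated. It is an illustration only, not the tutorial's actual implementation (that code lives in `1_computeRois.py` and is controlled by the "ROI generation" parameters in `PARAMETERS.py`); the function name and all default values are made up:

```python
# Illustrative sketch only: slide boxes of several scales and aspect ratios
# over the image. Names and default values are hypothetical.
def gridRois(imgWidth, imgHeight, scales=(0.3, 0.5, 0.7), aspectRatios=(0.5, 1.0, 2.0)):
    rois = []
    for scale in scales:                            # relative box size
        for ar in aspectRatios:                     # box width / height ratio
            w = int(imgWidth  * scale * ar ** 0.5)
            h = int(imgHeight * scale / ar ** 0.5)
            # step the box over the image with roughly 50% overlap between neighbors
            for x in range(0, imgWidth - w + 1, max(1, w // 2)):
                for y in range(0, imgHeight - h + 1, max(1, h // 2)):
                    rois.append((x, y, w, h))       # (x, y, width, height)
    return rois

print(len(gridRois(1000, 1000)))                    # number of generated grid ROIs
```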
120 | 121 | 122 | ### STEP 2: Computing CNTK inputs 123 | `Script: 2_cntkGenerateInputs.py` 124 | 125 | Each ROI generated in the last step has to be run through the CNTK model to compute its 4096-float Deep Neural Network representation (see the [Fast R-CNN](#fast-r-cnn) section). This requires three CNTK-specific input files to be generated for the training and the test set: 126 | - *{train,test}.txt*: each row contains the path to an image. 127 | - *{train,test}.rois.txt*: each row contains all ROIs for an image in relative (x,y,w,h) co-ordinates. 128 | - *{train,test}.roilabels.txt*: each row contains the labels for the ROIs in one-hot encoding. 129 | 130 | An in-depth understanding of how these files are structured is not necessary to follow this tutorial. However, two points are worth noting: 131 | - CNTK's Fast R-CNN implementation requires all images to be of the same size. For this reason, all images are first scaled and then centered and zero-padded (i.e. columns of gray-colored pixels are added to the left and right of the image, or rows at the top and bottom, respectively). Note that the scaling preserves the original aspect ratio. For our experiments, the input to the Neural Network is 1000 x 1000 pixels. 132 | Interestingly, upscaling an image can significantly improve accuracy if the objects to be detected are small (this is due to objects in ImageNet typically having a width and height of 100-200 pixels). 133 | - CNTK expects each image to have the same number of ROIs (for our experiments we use 2000). Hence, if the computation in step 1 returned more ROIs, then only the first 2000 are used. Likewise, if fewer ROIs were found, then the remaining spots are filled using ROIs with co-ordinates of (0,0,0,0). These “zero-padded” ROIs are only used during CNTK execution and have no influence on the training / test performance. 134 | 135 | This step writes the above-mentioned files to the directory *proc/grocery/cntkFiles/*. For debugging, the script `B2_cntkVisualizeInputs.py` can be used to visualize the content of these files (e.g. the Figure at the end of step 4 was generated using this script). 136 | 137 | 138 | 139 | ### STEP 3: Running CNTK 140 | `Script: 3_runCntk.py` 141 | 142 | We can now run the CNTK training, which takes as input the co-ordinates and labels files from the last step and writes the 4096-float embedding for each ROI of each image to *proc/grocery/cntkFiles/{train,test}_svm_parsed/[imageName].dat.npz*. This will take a few minutes and will automatically run on the GPU if one is detected. 143 | 144 | Note: Look for the line "Using GPU for training." in the console output to make sure the training runs on the GPU and not the CPU (which would be too slow). Also note that a previous CNTK run might still be running and blocking the GPU. 145 | 146 | 147 | ### STEP 4: Classifier training 148 | `Script: 4_trainSvm.py` 149 | 150 | We now train the classifier which, given an ROI as input, assigns it to one of the grocery items or to a “background” class. 151 | 152 | We use a slightly modified version of the published R-CNN code to train a linear SVM classifier. The main change is to load the 4096-float ROI embeddings from disk rather than running the network on-the-fly. An in-depth explanation of the training procedure can be found in the [R-CNN paper](http://arxiv.org/abs/1311.2524).
For the purpose of this tutorial we treat the training script as a black box which takes the training ROIs as input (or, to be precise, their 4096-float representations) and outputs N+1 linear classifiers: one for each class, plus one for the background. 153 | 154 | The training starts by loading all positive ROIs into memory. "Positive" here refers to ROIs that have a significant overlap with a ground truth annotated object. Negatives are then iteratively added using hard negative mining, and the SVM is retrained. A list and short description of the parameters that govern the SVM training can be found in the script `PARAMETERS.py`. 155 | 156 | The learned linear classifier for each class, i.e. a weight vector of 4096 floats plus a single float representing the bias term, is then written to the folder *proc/grocery/trainedSVMs/*. 157 | 158 | 159 | 160 | ### STEP 5: Evaluation and visualization 161 | `Scripts: 5_evaluateResults.py and 5_visualizeResults.py` 162 | 163 | Once training has succeeded, the model can be used to find objects in images. For this, every ROI in an image is classified and assigned a confidence of being orange, ketchup, ..., or background. The class with the highest confidence is then selected (most often “background”), and optionally a threshold is applied to reject detections with low confidence (a minimal sketch of this decision rule follows the detection figure below). 164 | 165 | The accuracy of the classifier can be measured using the script `5_evaluateResults.py`. This outputs the mean Average Precision (mAP; see the [Mean Average Precision](#mean-average-precision) section) for either the training or the test set. Keep in mind that the test set contains only 5 images, and hence these numbers need to be taken with a grain of salt. Due to randomization effects, one might get very different results when re-running the script. 166 | 167 | 168 | 169 | Results using 200 ROIs (this number is too low to get good accuracy but for demo purposes allows for fast training and scoring): 170 | 171 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP 172 | |---|---|---|---|---|---|--- 173 | |Test Set| 0.45 |1.00 |0.82 |0.76 | |**0.63** 174 | 175 | Results using 2000 ROIs: 176 | 177 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP 178 | |---|---|---|---|---|---|--- 179 | |Test Set| 0.32 | 0.48 | 0.82 | 0.82 | |**0.65** 180 | 181 | The output of the classifier using 2000 ROIs can be visualized using the script `5_visualizeResults.py`. Only ROIs classified as a grocery item (not background) are shown, and only if the confidence in the detection is 0.5 or above. Multiple ROIs are combined into single detections using [Non-Maxima Suppression](#non-maxima-suppression), the output of which is visualized below for the test images. 182 | 183 |

184 | alt text 185 | alt text 186 | alt text 187 | alt text 188 | alt text 189 |

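As referenced in step 5 above, here is a minimal sketch of that per-ROI decision rule: each 4096-float ROI embedding is scored against the learned linear classifiers, the class with the highest score wins, and low-confidence detections are rejected. The names and the threshold value are made up for illustration; the tutorial's actual implementation differs:

```python
import numpy as np

# Illustrative sketch: score each ROI embedding against the learned linear
# classifiers. 'svmWeights' has shape (nrClasses+1, 4096), 'svmBiases' has
# shape (nrClasses+1,); index 0 is assumed to be the background class.
def classifyRois(roiEmbeddings, svmWeights, svmBiases, threshold=0.5):
    scores = roiEmbeddings.dot(svmWeights.T) + svmBiases   # (nrRois, nrClasses+1)
    labels = scores.argmax(axis=1)                         # class with the highest confidence
    confidences = scores.max(axis=1)
    labels[confidences < threshold] = 0                    # reject low-confidence detections as background
    return labels, confidences

# Example with random data: 2000 ROIs, 8 grocery classes plus background
labels, confidences = classifyRois(np.random.randn(2000, 4096),
                                   np.random.randn(9, 4096), np.zeros(9))
```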
190 | 191 | In addition to visualizing the detected objects, the script `5_visualizeResults.py` also computes precision and recall after rejecting all detections with confidence scores below a given threshold. This information can be used to set the operating point of the final classifier: for example, given the table below, to reach 85% precision all detections with a score below 5.0 would have to be rejected (a minimal sketch of this computation follows the figure below). 192 | 193 |

194 | alt text 195 |

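The computation behind such a precision/recall table can be sketched as follows. This is a simplified, stand-alone illustration with made-up numbers, not the code of `5_visualizeResults.py`:

```python
# Illustrative sketch: precision and recall after rejecting all detections
# whose score falls below a threshold.
def precisionRecallAtThreshold(scores, isCorrect, nrGroundTruthObjects, threshold):
    kept = [correct for score, correct in zip(scores, isCorrect) if score >= threshold]
    if not kept:
        return 0.0, 0.0
    truePositives = sum(kept)
    precision = truePositives / float(len(kept))            # correct fraction of kept detections
    recall = truePositives / float(nrGroundTruthObjects)    # fraction of annotated objects found
    return precision, recall

# Four detections with made-up scores and correctness flags, 4 annotated objects:
print(precisionRecallAtThreshold([7.1, 5.3, 4.2, 1.0], [True, True, False, True], 4, 5.0))
# -> (1.0, 0.5): at threshold 5.0, two detections are kept and both are correct
```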
196 | 197 | 198 | 199 | ### STEP 6: Scoring images 200 | `Script: 6_scoreImage.py` 201 | 202 | Up to now our focus was on training a model and evaluating its performance. Hence all steps were performed one-by-one, and intermediate results were written to and loaded from disk. During scoring, given one or more images, it is preferable to perform all steps in-memory. This is exactly what the script `6_scoreImage.py` does: it loads a given image, computes the ROIs, runs each ROI through the DNN, evaluates the trained SVM if needed, and finally outputs a list of the detected objects. 203 | 204 | Note that the script calls functions in `helpers_cntk.py` which were originally written for steps 1-5. Loading the model takes a few seconds, but this only has to be done once; the model can then be kept in memory (e.g. in a web service which waits for images to be uploaded). 205 | 206 | 207 | 208 | PART 2 209 | -------------- 210 | In part 1 we learned how to classify ROIs by training a linear Support Vector Machine on the output of a given Neural Network. We will now show how to instead perform this classification directly in the Deep Neural Network. This is achieved by adding a new last layer which, given the input from the last fully connected layer, outputs the probability of each ROI belonging to each class. See the section [SVM vs NN training](#svm-vs-nn-training) for the pros and cons of the two approaches. 211 | 212 | Training the Neural Network instead of an SVM is done by simply changing the variable `classifier` in `PARAMETERS.py` from "svm" to "nn". Then, as described in part 1, all the scripts need to be executed in order, except for the SVM training in step 4. This will add a classification layer to the network, train the last layer(s) of the network, and for each ROI write its classification label and confidence to disk (rather than the 4096-float representation which was required to train the SVM). Note that NN training can cause an out-of-memory error on less powerful machines, which can possibly be avoided by reducing the minibatch size and, if needed, also the number of ROIs per image (see the variables `cntk_mb_size` and `cntk_nrRois` in `PARAMETERS.py`; a minimal example of these settings is sketched after the figures below). 213 | 214 | After running all steps, the mean Average Precision should roughly match the results below. 215 | 216 | Using 200 ROIs: 217 | 218 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP 219 | |---|---|---|---|---|---|--- 220 | |Test Set| 0.45 |0.97 |0.82 |1.00 | |**0.70** 221 | 222 | Using 2000 ROIs: 223 | 224 | |Dataset| AP(orange)|AP(eggBox)|AP(joghurt)|AP(ketchup)| | mAP 225 | |---|---|---|---|---|---|--- 226 | |Test Set| 1.00 |0.92 |1.00 |0.07 | |**0.87** 227 | 228 | 245 | 246 | 263 | 264 | The output of the Neural Network with 2000 ROIs on the five test images, after Non-Maxima Suppression to combine multiple detections, should look like this: 265 |

266 | alt text 267 | alt text 268 | alt text 269 | alt text 270 | alt text 271 |

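As a concrete example, the out-of-memory workaround mentioned above could look like the sketch below in `PARAMETERS.py`. The variable names are the ones referenced in the text; the numeric values are only assumptions and need to be tuned to your GPU memory:

```python
# In PARAMETERS.py: switch from SVM training (part 1) to NN training (part 2).
classifier   = "nn"    # was "svm" in part 1
cntk_mb_size = 2       # example value: reduce the minibatch size first on out-of-memory errors
cntk_nrRois  = 2000    # example value: if needed, also reduce the number of ROIs per image
```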
272 | 273 | PART 3 274 | -------------- 275 | So far we have trained and evaluated object detectors using the provided grocery dataset. It is straightforward to use a custom dataset instead: the necessary scripts for image annotation are included in the repository, and only minor code changes are required to point to a new dataset. 276 | 277 | First, let's have a look at the folder structure and the provided annotation files for the grocery data: 278 | Note how all positive, negative and test images and their annotations are in the subfolders *positive*, *negative* and *testImages* of *data/grocery/*. Each image (with the exception of the negative images) has (i) a similarly named *[imageName].bboxes.tsv* file where each row contains the co-ordinates of a manually labeled object (i.e. its bounding box); and (ii) a *[imageName].bboxes.labels.tsv* file where each row contains the class of the corresponding object (e.g. avocado or orange). A minimal sketch showing how to parse these files follows the annotation figure below. 279 | 280 | 281 | ### Image Annotation 282 | 283 | **Option #1: Visual Object Tagging Tool (Recommended)** 284 | 285 | The [Visual Object Tagging Tool (VOTT)](https://github.com/CatalystCode/VOTT) is a cross-platform annotation tool for tagging video and image assets. 286 | 287 | ![Vott Screen Shot](https://github.com/CatalystCode/VOTT/blob/master/media/4_Tagging_Job.jpg) 288 | 289 | VOTT provides the following **features**: 290 | 291 | - Computer-assisted tagging and tracking of objects in videos using the [Camshift tracking algorithm](http://opencv.jp/opencv-1.0.0_org/docs/papers/camshift.pdf). 292 | - Exporting tags and assets to CNTK Fast-RCNN format for training an object detection model. 293 | - Running and validating a trained CNTK object detection model on new videos to generate stronger models. 294 | 295 | How to annotate with VOTT: 296 | 297 | 1. Download the latest [Release](https://github.com/CatalystCode/VOTT/releases) 298 | 2. Follow the [Readme](https://github.com/CatalystCode/VOTT/blob/master/README.md) to run a tagging job 299 | 3. After tagging, export the tags to the dataset directory 300 | 301 | 302 | **Option #2: Using Annotation Scripts** 303 | 304 | The two annotation files per image can be generated using the scripts `A1_annotateImages.py` and `A2_annotateBboxLabels.py`. 305 | 306 | The first script lets the user draw rectangles around each object (see the left image below). Once all objects in an image are annotated, pressing the key 'n' writes the *.bboxes.tsv* file and proceeds to the next image, 'u' undoes (i.e. removes) the last rectangle, and 'q' quits the annotation tool. 307 | 308 | The second script loads these manually annotated rectangles for each image, displays them one-by-one, and asks the user to provide the object class by clicking on the respective button to the left of the window (see the right image below). Ground truth annotations marked as either "undecided" or "exclude" are fully excluded from further processing. 309 |

310 | alt text 311 | alt text 312 |

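Since the annotation format is plain text, reading it back is straightforward. Below is a minimal, self-contained sketch of how the two files of an image could be parsed; the helper name is made up, and this is not the repository's own loading code. It assumes the format shown in the data folder: one box per row with four whitespace-separated integers in *.bboxes.tsv*, and one class name per row in *.bboxes.labels.tsv*:

```python
# Illustrative sketch: load the annotation for one image.
def loadAnnotation(imgPath):
    boxPath = imgPath[:-len(".jpg")] + ".bboxes.tsv"
    labelPath = imgPath[:-len(".jpg")] + ".bboxes.labels.tsv"
    with open(boxPath) as f:
        boxes = [[int(v) for v in line.split()] for line in f if line.strip()]
    with open(labelPath) as f:
        labels = [line.strip() for line in f if line.strip()]
    assert len(boxes) == len(labels), "each box needs exactly one class label"
    return boxes, labels

boxes, labels = loadAnnotation("data/grocery/positive/0.jpg")
print(labels[0], boxes[0])   # -> joghurt [213, 337, 329, 473]
```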
313 | 314 | ### Using a custom dataset 315 | 316 | If you used VOTT to generate and export your dataset, it will already be sorted into the *positive*, *negative* and *testImages* subfolders. 317 | 318 | Otherwise, once all (non-negative) images are annotated using the annotation scripts, the images and their annotation files should be copied to the *positive*, *negative* and *testImages* subfolders of a new directory called *data/myOwnImages/*, where the string "myOwnImages" can be replaced at will. 319 | 320 | The only required code change is to update the `datasetName` variable in `PARAMETERS.py` to the newly created folder: 321 | ```python 322 | datasetName = "myOwnImages" 323 | ``` 324 | 325 | All steps in part 1 can then be executed in order and will use the new dataset. 326 | 327 | 328 | ### How to get good results 329 | 330 | As is true for most Machine Learning projects, getting good results requires careful parameter tuning. To help with this, all important parameters are specified, and briefly explained, in a single place: the `PARAMETERS.py` file. 331 | 332 | Here are a few tips on how to find good parameters and design a good training set: 333 | - Select images carefully and perform annotations consistently across all images. Typically, all objects in an image need to be annotated, even if the image contains many of them; it is common practice to remove such cluttered images instead. The same holds for images where one is uncertain about the label of an object, or where it is unclear whether the object should be annotated at all (e.g. due to truncation, occlusion, motion blur, etc.). 334 | - During Region-of-Interest generation in step 1, all ROIs which are deemed too small, too big, etc. are discarded. This filtering step relies on thresholds on the respective properties, which are defined in `PARAMETERS.py` (paragraph "ROI generation"). 335 | Visualizing the generated ROIs helps tremendously with debugging and can be done either while computing the ROIs in the script `1_computeRois.py` itself, or by visualizing the CNTK training files using the script `B2_cntkVisualizeInputs.py`. In addition, the script `B1_evaluateRois.py` computes the percentage of annotated ground truth objects that are covered by one or more ROIs (i.e. the recall). Generally, the more ROIs (variable `cntk_nrRois`), the better the accuracy, but at slower training and scoring speeds. 336 | - Training a linear SVM (step 4) is relatively robust, and hence for most problems the corresponding parameters in `PARAMETERS.py` (paragraph "svm training") do not need to be modified. 337 | The evaluation script `5_evaluateResults.py` can be used to verify that the SVM successfully learned to capture the training data (typically the APs are above 0.5). 338 | - Training a Neural Network (part 2) is significantly more difficult, and often requires expert knowledge to make the network converge to a good solution (see [Michael Nielsen's](http://neuralnetworksanddeeplearning.com/) great introduction to Deep Neural Networks). Arguably the most important parameter here is the learning rate (parameter `cntk_lr_per_image`). 339 | - In addition to computing the mAP, always also visualize the results on the test and on the training set. This is done with the script `5_visualizeResults.py` and helps in understanding the error modes and in verifying that the model behaves as expected. 340 | 341 | ### Publishing the model as a REST API 342 | 343 | Finally, the trained model can be used to create a web service or REST API on Azure.
For this, we recommend using Flask, a Python web framework which makes it easy to run Python code in the cloud. See the tutorial [Creating web apps with Flask in Azure](https://azure.microsoft.com/en-us/documentation/articles/web-sites-python-create-deploy-flask-app/) for an introduction to Flask, and the GitHub repo [Azure-WebApp-w-CNTK](https://github.com/ilkarman/Azure-WebApp-w-CNTK) for an example of how to deploy and run CNTK inside a web service on Azure. 344 | 345 | 346 | 347 | 348 | PART 4 349 | -------------- 350 | 354 | 355 | The last part of this tutorial shows how to reproduce published results on the Pascal VOC dataset. 356 | 357 | First, the Pascal VOC data as well as the pre-computed Selective Search boxes need to be downloaded from these links: [VOCtest_06-Nov-2007.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar), 358 | [VOCtrainval_06-Nov-2007.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar), 359 | [selective_search_data.tgz](http://www.cs.berkeley.edu/~rbg/fast-rcnn-data/selective_search_data.tgz). 360 | 361 | 362 | 363 | 364 | 365 | 366 | These three archives should be extracted and copied into the *resources/pascalVocData/* directory. Your resources folder should then look like this: 367 | ```bash 368 | resources/pascalVocData/selective_search_data 369 | resources/pascalVocData/VOCdevkit2007/VOC2007 370 | resources/pascalVocData/VOCdevkit2007/VOC2007/Annotations 371 | resources/pascalVocData/VOCdevkit2007/VOC2007/ImageSets 372 | resources/pascalVocData/VOCdevkit2007/VOC2007/JPEGImages 373 | ``` 374 | 375 | Second, the `datasetName` variable in `PARAMETERS.py` needs to point to the Pascal VOC dataset instead of our grocery dataset: 376 | ```python 377 | datasetName = "pascalVoc" 378 | ``` 379 | 380 | Now the steps from part 1 can be executed in order, with two exceptions: 381 | - Step 1: ROI generation is not necessary since we use the downloaded Selective Search boxes instead. 382 | - Step 4: SVM training is not necessary since the classification is done by adding a new softmax layer to the network (similar to part 2). 383 | 384 | Note that Pascal VOC is a very big dataset, and hence some of the steps (especially the CNTK training in step 3) will take hours to complete. 385 | 386 | The table below shows the mean Average Precision (mAP) of our final model and compares this figure to the corresponding experiment in the [Fast R-CNN](https://arxiv.org/pdf/1504.08083v2.pdf) paper (Table 6, group "S"). Note that this tutorial uses an AlexNet architecture and that we do not perform bounding box regression. To be consistent with the paper, our model is trained using the VOC 2007 "trainval" set, and the mean Average Precision is computed on the VOC 2007 "test" set. 387 | 388 | |Dataset| mAP 389 | |---|--- 390 | |Published results|0.52 391 | |Our results|0.48 392 | 393 | More information on training a Pascal VOC classifier (including a download link to a trained model) can be found on [CNTK's Fast-RCNN page](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Detection/FastRCNN). 394 | 395 | 396 | TECHNOLOGY 397 | -------------- 398 | 399 | ### Fast R-CNN 400 | R-CNNs for Object Detection were first presented in 2014 by [Ross Girshick et al.](http://arxiv.org/abs/1311.2524) and shown to outperform previous state-of-the-art approaches on one of the major object recognition challenges in the field: [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/).
Since then, two follow-up papers with significant speed improvements were published: [Fast R-CNN](https://arxiv.org/pdf/1504.08083v2.pdf) and [Faster R-CNN](https://arxiv.org/abs/1506.01497). 401 | 402 | The basic idea of R-CNN is to take a deep Neural Network which was originally trained for image classification on millions of annotated images, and to modify it for the purpose of object detection. This idea is illustrated in the Figure below (taken from the first R-CNN paper): (1) Given an input image, (2) a large number of region proposals is generated in a first step. (3) These region proposals, or Regions of Interest (ROIs), are then each independently sent through the network, which outputs a vector of e.g. 4096 floating point values for each ROI. Finally, (4) a classifier is learned which takes the 4096-float ROI representation as input and outputs a label and confidence for each ROI. 403 |

404 | alt text 405 |

406 | 407 | While this approach works well in terms of accuracy, it is very costly to compute since the Neural Network has to be evaluated for each ROI. Fast R-CNN addresses this drawback by evaluating most of the network (to be specific: the convolution layers) only a single time per image. According to the authors, this leads to a 213x speed-up during testing and a 9x speed-up during training, without loss of accuracy. 408 | 409 | The original Caffe implementations used in the R-CNN papers can be found on GitHub: 410 | [RCNN](https://github.com/rbgirshick/rcnn), [Fast R-CNN](https://github.com/rbgirshick/fast-rcnn), and [Faster R-CNN](https://github.com/rbgirshick/py-faster-rcnn). This tutorial uses some of the code from these repositories, notably (but not exclusively) for SVM training and model evaluation. 411 | 412 | ### SVM vs NN training 413 | Above, we described how a linear SVM is trained on the 4096-float ROI embedding. Alternatively, this classification can be performed directly in the Neural Network, using a softmax layer that takes the 4096 floats of the second-to-last fully connected layer as input; the pros and cons of the two approaches are outlined below. 414 | 415 | The advantage of adding a new softmax layer is that the full network can be retrained using backpropagation, including all convolution layers, which can lead to (slightly to moderately) better prediction accuracies. Another (implementation-dependent) advantage is that only (number of classes + 1) floats per ROI need to be written to disk, compared to the 4096-float ROI embedding used to train an SVM. 416 | On the other hand, training a Neural Network requires a good GPU, is even then 1-2 orders of magnitude slower than training an SVM, and requires extensive parameter tweaking and expert knowledge. 417 | 418 | ### Selective Search 419 | [Selective Search](http://koen.me/research/pub/uijlings-ijcv2013-draft.pdf) is a method for finding a large set of possible object locations in an image, independent of the class of the actual object. It works by clustering image pixels into segments and then performing hierarchical clustering to combine segments from the same object into object proposals. The first image in part 1 shows an example output of Selective Search, where each possible object location is visualized by a green rectangle. These rectangles are then used as Regions of Interest (ROIs) in the R-CNN pipeline. 420 | 421 | The goal of ROI generation is to find a small set of ROIs that nevertheless tightly covers as many objects in the image as possible. This computation has to be sufficiently quick, while at the same time finding object locations at different scales and aspect ratios. Selective Search was shown to perform well for this task, with a good accuracy-to-speed trade-off. 422 | 423 | 424 | ### Non-maxima suppression 425 | Object detection methods often output multiple detections which fully or partly cover the same object in an image. These ROIs need to be merged to be able to count objects and to obtain their exact locations in the image. This is traditionally done using a technique called Non-Maxima Suppression (NMS). The version of NMS we use (which was also used in the R-CNN publications) does not merge ROIs but instead tries to identify the ROIs which best cover the real locations of an object, and discards all other ROIs. This is implemented by iteratively selecting the ROI with the highest confidence and removing all other ROIs which significantly overlap this ROI and are classified to be of the same class.
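The following is a minimal numpy sketch of this greedy procedure for boxes that all belong to the same class. The repository ships its own version in *fastRCNN/nms.py*; the overlap threshold below is just an example value:

```python
import numpy as np

# Illustrative sketch of greedy NMS; boxes are (x1, y1, x2, y2).
def nms(boxes, scores, overlapThreshold=0.3):
    boxes = np.asarray(boxes, dtype=float)
    scores = np.asarray(scores)
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    order = scores.argsort()[::-1]          # ROIs sorted by descending confidence
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))                 # keep the most confident remaining ROI
        # overlap (intersection-over-union) of ROI i with all remaining ROIs
        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        intersection = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        iou = intersection / (areas[i] + areas[order[1:]] - intersection)
        order = order[1:][iou <= overlapThreshold]   # discard significantly overlapping ROIs
    return keep

print(nms([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], [0.9, 0.8, 0.7]))
# -> [0, 2]: box 1 overlaps the more confident box 0 too much and is suppressed
```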
426 | 427 | Detection results before (left) and after (right) Non-Maxima Suppression: 428 |

429 | alt text 430 | alt text 431 |

432 | 433 | ### Mean Average Precision 434 | Once trained, the quality of the model can be measured using different criteria, such as precision, recall, accuracy, area-under-curve, etc. A common metric, used in the Pascal VOC object recognition challenge, is the Average Precision (AP) per class. Average Precision takes the confidence of the detections into account and hence assigns a smaller penalty to false detections with low confidence. For a description of Average Precision see [Everingham et al.](http://homepages.inf.ed.ac.uk/ckiw/postscript/ijcv_voc09.pdf). The mean Average Precision (mAP) is computed by taking the average over all per-class APs. 435 | 436 | FUTURE WORK 437 | --------------- 438 | 439 | One big item for future work is to use CNTK's Python APIs. Once these are fully available, the following changes can be made, which should significantly improve run-time performance and simplify the code: 440 | - Reduce start-up time by loading the model only once and then keeping it persistent in memory. <-- Done in v1. 441 | - Reduce processing time using in-memory calls of the Python wrappers, rather than writing all inputs and outputs to file first and subsequently parsing the CNTK output back into memory (this is especially expensive for the temporary file *train.z* in step 3, which can be many gigabytes in size). <-- Done in v1. 442 | - Reduce code complexity by evaluating the network for each ROI on-the-fly in the `im_detect()` function rather than pre-computing all outputs in steps 4 and 5. 443 | 444 | Other items for future work include: 445 | - Replacing Selective Search with a faster and more accurate implementation. <-- Done in v1. 446 | - Adding bounding box regression. 447 | - Implementing fast*er* R-CNN, i.e. performing ROI generation inside the DNN. 448 | - Using a more recent DNN topology such as ResNet instead of AlexNet.
449 | 450 | 451 | AUTHOR 452 | --------------- 453 | Patrick Buehler, Senior Data Scientist 454 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/__init__.py -------------------------------------------------------------------------------- /data/grocery/negative/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/1.jpg -------------------------------------------------------------------------------- /data/grocery/negative/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/2.jpg -------------------------------------------------------------------------------- /data/grocery/negative/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/3.jpg -------------------------------------------------------------------------------- /data/grocery/negative/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/4.jpg -------------------------------------------------------------------------------- /data/grocery/negative/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/negative/5.jpg -------------------------------------------------------------------------------- /data/grocery/positive/0.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | joghurt 2 | squash 3 | mushroom 4 | eggBox 5 | ketchup 6 | mustard 7 | water 8 | orange 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/0.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 213 337 329 473 2 | 356 347 476 468 3 | 489 408 621 468 4 | 663 393 804 467 5 | 623 549 720 619 6 | 475 559 565 623 7 | 656 709 726 892 8 | 361 810 435 880 9 | 207 741 327 881 10 | -------------------------------------------------------------------------------- /data/grocery/positive/0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/0.jpg -------------------------------------------------------------------------------- /data/grocery/positive/11.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | water 2 | squash 3 | mushroom 4 | orange 5 | eggBox 6 | mustard 7 | joghurt 8 | ketchup 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/11.bboxes.tsv: 
-------------------------------------------------------------------------------- 1 | 175 457 265 536 2 | 419 389 537 528 3 | 617 460 760 527 4 | 724 603 806 670 5 | 536 579 677 677 6 | 694 873 770 962 7 | 499 774 603 916 8 | 383 793 443 868 9 | 296 1010 419 1153 10 | -------------------------------------------------------------------------------- /data/grocery/positive/11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/11.jpg -------------------------------------------------------------------------------- /data/grocery/positive/12.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | water 2 | mushroom 3 | squash 4 | eggBox 5 | joghurt 6 | mustard 7 | ketchup 8 | orange 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/12.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 312 316 373 484 2 | 463 423 587 493 3 | 655 361 758 485 4 | 541 541 686 641 5 | 596 718 690 841 6 | 737 848 824 932 7 | 387 749 448 824 8 | 225 814 301 882 9 | 295 957 416 1090 10 | -------------------------------------------------------------------------------- /data/grocery/positive/12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/12.jpg -------------------------------------------------------------------------------- /data/grocery/positive/13.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | squash 2 | water 3 | squash 4 | eggBox 5 | mushroom 6 | joghurt 7 | ketchup 8 | orange 9 | mustard 10 | -------------------------------------------------------------------------------- /data/grocery/positive/13.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 201 348 324 491 2 | 480 425 612 493 3 | 681 349 814 499 4 | 520 545 672 640 5 | 694 573 830 647 6 | 584 721 681 850 7 | 375 754 439 826 8 | 209 821 285 888 9 | 724 856 804 940 10 | -------------------------------------------------------------------------------- /data/grocery/positive/13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/13.jpg -------------------------------------------------------------------------------- /data/grocery/positive/14.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | water 3 | squash 4 | orange 5 | eggBox 6 | joghurt 7 | squash 8 | ketchup 9 | mushroom 10 | -------------------------------------------------------------------------------- /data/grocery/positive/14.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 304 487 399 537 2 | 485 471 635 527 3 | 700 375 836 527 4 | 553 607 633 684 5 | 706 577 864 676 6 | 613 760 720 886 7 | 728 817 853 962 8 | 461 861 580 954 9 | 236 849 377 949 10 | -------------------------------------------------------------------------------- /data/grocery/positive/14.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/14.jpg -------------------------------------------------------------------------------- /data/grocery/positive/17.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | orange 4 | squash 5 | water 6 | mushroom 7 | joghurt 8 | squash 9 | eggBox 10 | -------------------------------------------------------------------------------- /data/grocery/positive/17.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 264 469 333 520 2 | 343 461 440 523 3 | 455 453 533 527 4 | 539 376 670 528 5 | 774 445 901 529 6 | 656 593 820 676 7 | 688 800 820 952 8 | 539 781 648 908 9 | 227 838 509 933 10 | -------------------------------------------------------------------------------- /data/grocery/positive/17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/17.jpg -------------------------------------------------------------------------------- /data/grocery/positive/18.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | squash 4 | squash 5 | mushroom 6 | orange 7 | water 8 | joghurt 9 | eggBox 10 | -------------------------------------------------------------------------------- /data/grocery/positive/18.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 195 492 295 548 2 | 369 412 436 547 3 | 587 403 721 552 4 | 729 399 876 552 5 | 660 623 824 706 6 | 553 625 629 700 7 | 742 793 814 981 8 | 604 820 720 965 9 | 240 813 409 953 10 | -------------------------------------------------------------------------------- /data/grocery/positive/18.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/18.jpg -------------------------------------------------------------------------------- /data/grocery/positive/19.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | mushroom 4 | eggBox 5 | orange 6 | water 7 | joghurt 8 | squash 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/19.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 244 444 307 565 2 | 336 420 409 564 3 | 563 504 686 569 4 | 689 500 814 571 5 | 532 641 608 714 6 | 697 806 770 990 7 | 563 841 681 997 8 | 376 846 497 978 9 | 216 842 335 981 10 | -------------------------------------------------------------------------------- /data/grocery/positive/19.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/19.jpg -------------------------------------------------------------------------------- /data/grocery/positive/2.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | eggBox 2 | mustard 3 | joghurt 4 | orange 5 | squash 6 | water 7 | squash 8 | mushroom 9 | ketchup 10 | 
-------------------------------------------------------------------------------- /data/grocery/positive/2.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 208 425 369 503 2 | 432 387 503 508 3 | 603 389 701 507 4 | 728 440 808 513 5 | 571 532 700 657 6 | 674 700 730 844 7 | 509 785 627 921 8 | 380 769 504 841 9 | 231 753 291 898 10 | -------------------------------------------------------------------------------- /data/grocery/positive/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/2.jpg -------------------------------------------------------------------------------- /data/grocery/positive/21.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mushroom 2 | eggBox 3 | water 4 | mustard 5 | ketchup 6 | squash 7 | squash 8 | joghurt 9 | orange 10 | -------------------------------------------------------------------------------- /data/grocery/positive/21.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 176 299 301 451 2 | 188 448 487 527 3 | 573 324 649 531 4 | 736 217 825 371 5 | 724 372 812 531 6 | 704 559 838 680 7 | 649 822 777 960 8 | 251 790 367 922 9 | 240 732 315 798 10 | -------------------------------------------------------------------------------- /data/grocery/positive/21.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/21.jpg -------------------------------------------------------------------------------- /data/grocery/positive/22.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | eggBox 2 | water 3 | ketchup 4 | squash 5 | mushroom 6 | squash 7 | mustard 8 | orange 9 | joghurt 10 | -------------------------------------------------------------------------------- /data/grocery/positive/22.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 201 439 511 519 2 | 595 309 677 520 3 | 750 344 841 519 4 | 736 547 868 673 5 | 527 589 668 673 6 | 680 810 808 954 7 | 577 806 655 932 8 | 445 860 517 932 9 | 272 782 391 929 10 | -------------------------------------------------------------------------------- /data/grocery/positive/22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/22.jpg -------------------------------------------------------------------------------- /data/grocery/positive/23.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | orange 2 | ketchup 3 | water 4 | squash 5 | squash 6 | mustard 7 | eggBox 8 | mushroom 9 | joghurt 10 | -------------------------------------------------------------------------------- /data/grocery/positive/23.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 303 424 368 479 2 | 508 360 564 489 3 | 704 319 765 483 4 | 692 517 826 637 5 | 651 774 774 909 6 | 567 714 620 818 7 | 363 814 635 910 8 | 408 750 509 821 9 | 248 756 365 894 10 | -------------------------------------------------------------------------------- 
/data/grocery/positive/23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/23.jpg -------------------------------------------------------------------------------- /data/grocery/positive/24.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | orange 2 | ketchup 3 | water 4 | mustard 5 | squash 6 | mushroom 7 | squash 8 | eggBox 9 | joghurt 10 | -------------------------------------------------------------------------------- /data/grocery/positive/24.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 365 412 421 471 2 | 515 351 579 485 3 | 672 324 733 487 4 | 832 353 914 496 5 | 744 528 880 645 6 | 549 560 708 643 7 | 682 777 809 918 8 | 440 773 627 910 9 | 291 738 403 877 10 | -------------------------------------------------------------------------------- /data/grocery/positive/24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/24.jpg -------------------------------------------------------------------------------- /data/grocery/positive/26.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | eggBox 4 | squash 5 | mushroom 6 | orange 7 | joghurt 8 | squash 9 | water 10 | -------------------------------------------------------------------------------- /data/grocery/positive/26.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 215 371 275 501 2 | 335 372 403 499 3 | 508 439 681 513 4 | 730 365 866 512 5 | 529 587 690 672 6 | 405 571 487 647 7 | 436 746 543 878 8 | 252 784 379 914 9 | 740 765 816 952 10 | -------------------------------------------------------------------------------- /data/grocery/positive/26.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/26.jpg -------------------------------------------------------------------------------- /data/grocery/positive/3.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | eggBox 3 | ketchup 4 | squash 5 | orange 6 | joghurt 7 | mushroom 8 | water 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/3.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 196 468 325 515 2 | 355 455 584 513 3 | 607 459 696 517 4 | 557 539 688 659 5 | 724 596 809 665 6 | 666 784 785 936 7 | 388 785 515 858 8 | 268 746 333 926 9 | 588 1018 716 1162 10 | -------------------------------------------------------------------------------- /data/grocery/positive/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/3.jpg -------------------------------------------------------------------------------- /data/grocery/positive/4.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | ketchup 2 | 
mustard 3 | eggBox 4 | squash 5 | orange 6 | joghurt 7 | water 8 | mushroom 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/4.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 307 481 423 544 2 | 532 489 661 544 3 | 704 471 852 541 4 | 701 568 837 693 5 | 565 621 644 689 6 | 674 766 781 893 7 | 355 724 409 865 8 | 465 872 605 965 9 | 603 1052 730 1201 10 | -------------------------------------------------------------------------------- /data/grocery/positive/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/4.jpg -------------------------------------------------------------------------------- /data/grocery/positive/6.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mushroom 2 | squash 3 | mustard 4 | ketchup 5 | orange 6 | joghurt 7 | eggBox 8 | water 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/6.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 179 463 328 532 2 | 403 391 497 511 3 | 580 381 636 516 4 | 701 343 774 503 5 | 575 585 653 659 6 | 677 777 792 918 7 | 515 786 635 924 8 | 341 737 408 906 9 | 611 1006 741 1152 10 | -------------------------------------------------------------------------------- /data/grocery/positive/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/6.jpg -------------------------------------------------------------------------------- /data/grocery/positive/7.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | squash 2 | mushroom 3 | ketchup 4 | mustard 5 | orange 6 | water 7 | eggBox 8 | joghurt 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/7.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 213 344 344 481 2 | 472 424 589 480 3 | 627 329 694 484 4 | 741 361 817 485 5 | 714 567 792 637 6 | 643 845 841 926 7 | 492 757 589 888 8 | 307 752 423 898 9 | 311 969 435 1106 10 | -------------------------------------------------------------------------------- /data/grocery/positive/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/7.jpg -------------------------------------------------------------------------------- /data/grocery/positive/8.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | mushroom 4 | squash 5 | water 6 | eggBox 7 | joghurt 8 | orange 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/8.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 248 384 309 509 2 | 347 384 403 511 3 | 519 453 661 520 4 | 726 384 869 525 5 | 682 607 838 682 6 | 702 776 804 905 7 | 347 730 439 852 8 | 277 840 347 909 9 | 299 982 417 1125 10 | 
-------------------------------------------------------------------------------- /data/grocery/positive/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/8.jpg -------------------------------------------------------------------------------- /data/grocery/positive/9.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | water 3 | squash 4 | mushroom 5 | orange 6 | eggBox 7 | ketchup 8 | joghurt 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/positive/9.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 232 452 379 509 2 | 512 341 572 512 3 | 720 364 866 515 4 | 535 588 685 670 5 | 758 596 837 663 6 | 692 772 805 900 7 | 488 841 625 904 8 | 333 732 427 861 9 | 291 1001 411 1144 10 | -------------------------------------------------------------------------------- /data/grocery/positive/9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/positive/9.jpg -------------------------------------------------------------------------------- /data/grocery/testImages/10.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | squash 2 | orange 3 | water 4 | mushroom 5 | eggBox 6 | ketchup 7 | mustard 8 | joghurt 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/10.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 209 361 339 496 2 | 399 424 471 495 3 | 509 319 568 489 4 | 737 332 858 489 5 | 535 541 682 648 6 | 617 820 732 910 7 | 467 812 563 882 8 | 280 762 396 904 9 | 305 980 423 1117 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/10.jpg -------------------------------------------------------------------------------- /data/grocery/testImages/15.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | orange 4 | water 5 | squash 6 | squash 7 | joghurt 8 | eggBox 9 | mushroom 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/15.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 212 379 295 508 2 | 325 352 408 512 3 | 451 443 532 513 4 | 571 303 644 515 5 | 696 353 837 509 6 | 714 797 842 938 7 | 551 786 670 938 8 | 391 788 509 932 9 | 237 824 372 918 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/15.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/15.jpg -------------------------------------------------------------------------------- /data/grocery/testImages/20.bboxes.labels.tsv: 
-------------------------------------------------------------------------------- 1 | mushroom 2 | ketchup 3 | eggBox 4 | water 5 | orange 6 | mustard 7 | joghurt 8 | squash 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/20.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 571 475 693 537 2 | 697 336 768 468 3 | 698 472 814 540 4 | 635 615 832 686 5 | 560 615 635 682 6 | 601 713 673 837 7 | 576 810 684 962 8 | 469 812 577 946 9 | 349 809 465 936 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/20.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/20.jpg -------------------------------------------------------------------------------- /data/grocery/testImages/25.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mustard 2 | ketchup 3 | water 4 | squash 5 | eggBox 6 | mushroom 7 | orange 8 | joghurt 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/25.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 245 369 315 499 2 | 320 348 392 499 3 | 407 329 463 499 4 | 524 385 624 501 5 | 773 444 920 523 6 | 560 587 716 670 7 | 433 565 513 644 8 | 556 793 673 945 9 | 271 772 395 905 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/25.jpg -------------------------------------------------------------------------------- /data/grocery/testImages/5.bboxes.labels.tsv: -------------------------------------------------------------------------------- 1 | mushroom 2 | joghurt 3 | eggBox 4 | squash 5 | orange 6 | water 7 | ketchup 8 | mustard 9 | squash 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/5.bboxes.tsv: -------------------------------------------------------------------------------- 1 | 288 425 445 493 2 | 511 371 613 495 3 | 740 425 893 503 4 | 733 529 872 657 5 | 585 577 666 648 6 | 372 713 436 878 7 | 545 836 690 912 8 | 714 824 821 896 9 | 623 1012 749 1156 10 | -------------------------------------------------------------------------------- /data/grocery/testImages/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/data/grocery/testImages/5.jpg -------------------------------------------------------------------------------- /deprecated_3_runCntk_brainscript.py: -------------------------------------------------------------------------------- 1 | import os, sys, importlib 2 | import shutil, time 3 | import subprocess 4 | import PARAMETERS 5 | locals().update(importlib.import_module("PARAMETERS").__dict__) 6 | 7 | 8 | 9 | #################################### 10 | # Parameters 11 | #################################### 12 | cntkBinariesDir = "C:/local/CNTK-2-0-rc1/cntk/cntk/" 13 | 14 | # no need to change this 15 | 
cntkCmdStrPattern = "{0}/cntk.exe configFile={1}config.cntk currentDirectory={1}" 16 | 17 | 18 | 19 | #################################### 20 | # Main 21 | #################################### 22 | print("classifier = " + classifier) 23 | if not os.path.exists(cntkBinariesDir + "/cntk.exe"): 24 | raise Exception("Cannot find cntk.exe in directory: " + cntkBinariesDir) 25 | deleteAllFilesInDirectory(cntkFilesDir + "/tmp", None) 26 | shutil.copy(os.path.join(cntkResourcesDir, "config.cntk"), cntkFilesDir) 27 | 28 | #generate cntk command string 29 | cmdStr = cntkCmdStrPattern.format(cntkBinariesDir, cntkFilesDir, classifier) 30 | cmdStr += " ImageH={} ImageW={}".format(cntk_padHeight, cntk_padWidth) 31 | cmdStr += " NumLabels={0} NumTrainROIs={1} NumTestROIs={1}".format(len(classes), cntk_nrRois) 32 | cmdStr += " TrainROIDim={} TrainROILabelDim={}".format(4*cntk_nrRois, cntk_nrRois * cntk_featureDimensions[classifier]) 33 | cmdStr += " TestROIDim={} TestROILabelDim={}".format( 4*cntk_nrRois, cntk_nrRois * cntk_featureDimensions[classifier]) 34 | if classifier == 'svm': 35 | cmdStr += " [Train=[SGD=[maxEpochs=0]]]" #no need to train the network if just using it as featurizer 36 | cmdStr += " [WriteTest=[outputNodeNames=(z.fcOut.h2.y)]]" 37 | cmdStr += " [WriteTrain=[outputNodeNames=(z.fcOut.h2.y)]]" 38 | 39 | #run cntk 40 | tstart = datetime.datetime.now() 41 | os.environ['ACML_FMA'] = str(0) 42 | print(cmdStr) 43 | pid = subprocess.Popen(cmdStr, cwd = cntkFilesDir) #, creationflags=subprocess.CREATE_NEW_CONSOLE) 44 | pid.wait() 45 | print ("Time running cntk [s]: " + str((datetime.datetime.now() - tstart).total_seconds())) 46 | 47 | #delete model files written during cntk training 48 | filenames = getFilesInDirectory(cntkFilesDir + "/tmp/", postfix = None) 49 | for filename in filenames: 50 | if filename.startswith('Fast-RCNN.'): 51 | os.remove(cntkFilesDir + "/tmp/" + filename) 52 | assert pid.returncode == 0, "ERROR: cntk ended with exit code {}".format(pid.returncode) 53 | 54 | #parse cntk output 55 | print("classifier = " + classifier) 56 | image_sets = ["test", "train"] 57 | for image_set in image_sets: 58 | print("Parsing CNTK output for image set: " + image_set) 59 | cntkImgsListPath = cntkFilesDir + image_set + ".txt" 60 | outParsedDir = cntkFilesDir + image_set + "_" + classifier + "_parsed/" 61 | if classifier == 'svm': 62 | cntkOutputPath = cntkFilesDir + image_set + ".z.fcOut.h2.y" 63 | elif classifier == 'nn': 64 | cntkOutputPath = cntkFilesDir + image_set + ".z" 65 | else: 66 | error 67 | 68 | #write cntk output for each image to separate file 69 | makeDirectory(outParsedDir) 70 | parseCntkOutput(cntkImgsListPath, cntkOutputPath, outParsedDir, cntk_nrRois, cntk_featureDimensions[classifier], 71 | saveCompressed = True, skipCheck = False) #, skip5Mod = 0) 72 | 73 | #delete cntk output file which can be very large and are no longer needed 74 | deleteFile(cntkOutputPath) 75 | print("DONE.") -------------------------------------------------------------------------------- /doc/0.filter.roi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.filter.roi.jpg -------------------------------------------------------------------------------- /doc/0.grid.roi.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.grid.roi.jpg -------------------------------------------------------------------------------- /doc/0.ss.roi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/0.ss.roi.jpg -------------------------------------------------------------------------------- /doc/anno_boxes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/anno_boxes.jpg -------------------------------------------------------------------------------- /doc/anno_labels.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/anno_labels.jpg -------------------------------------------------------------------------------- /doc/nn_00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_00.jpg -------------------------------------------------------------------------------- /doc/nn_00_no_nms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_00_no_nms.jpg -------------------------------------------------------------------------------- /doc/nn_01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_01.jpg -------------------------------------------------------------------------------- /doc/nn_110.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_110.jpg -------------------------------------------------------------------------------- /doc/nn_215.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_215.jpg -------------------------------------------------------------------------------- /doc/nn_425.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_425.jpg -------------------------------------------------------------------------------- /doc/nn_55.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/nn_55.jpg -------------------------------------------------------------------------------- /doc/precision_recall.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/precision_recall.jpg -------------------------------------------------------------------------------- 
/doc/rcnnPipeline.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/rcnnPipeline.JPG -------------------------------------------------------------------------------- /doc/svm_010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_010.jpg -------------------------------------------------------------------------------- /doc/svm_115.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_115.jpg -------------------------------------------------------------------------------- /doc/svm_220.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_220.jpg -------------------------------------------------------------------------------- /doc/svm_325.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_325.jpg -------------------------------------------------------------------------------- /doc/svm_45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/doc/svm_45.jpg -------------------------------------------------------------------------------- /fastRCNN/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from .imdb import imdb 8 | from .pascal_voc import pascal_voc 9 | 10 | -------------------------------------------------------------------------------- /fastRCNN/imdb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os, sys 9 | import os.path as osp 10 | import PIL 11 | import numpy as np 12 | import scipy.sparse 13 | import platform 14 | from builtins import range 15 | 16 | if sys.version_info[1] == 4 and sys.version_info[0] == 3: 17 | from .utils34_win64.cython_bbox import bbox_overlaps 18 | elif sys.version_info[1] == 5 and sys.version_info[0] == 3: 19 | from .utils35_win64.cython_bbox import bbox_overlaps 20 | else: 21 | print("ERROR: Python version {} not supported".format(sys.version_info)) 22 | error 23 | 24 | 25 | class imdb(object): 26 | """Image database.""" 27 | 28 | def __init__(self, name): 29 | self._name = name 30 | self._num_classes = 0 31 | self._classes = [] 32 | self._image_index = [] 33 | self._obj_proposer = 'selective_search' 34 | self._roidb = None 35 | self._roidb_handler = self.default_roidb 
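The cython_bbox import above is a precompiled Windows extension; purely for illustration, a NumPy-only equivalent of its bbox_overlaps (pairwise IoU between two [x1, y1, x2, y2] box arrays, using the same +1 inclusive-pixel convention as nms.py below) might look like this sketch:

import numpy as np

def bbox_overlaps_np(boxes, query_boxes):
    # boxes: (N, 4), query_boxes: (K, 4) -> (N, K) matrix of IoU values
    ix1 = np.maximum(boxes[:, None, 0], query_boxes[None, :, 0])
    iy1 = np.maximum(boxes[:, None, 1], query_boxes[None, :, 1])
    ix2 = np.minimum(boxes[:, None, 2], query_boxes[None, :, 2])
    iy2 = np.minimum(boxes[:, None, 3], query_boxes[None, :, 3])
    inter = np.maximum(ix2 - ix1 + 1, 0) * np.maximum(iy2 - iy1 + 1, 0)
    area_b = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    area_q = (query_boxes[:, 2] - query_boxes[:, 0] + 1) * (query_boxes[:, 3] - query_boxes[:, 1] + 1)
    return inter / (area_b[:, None] + area_q[None, :] - inter)

The compiled extension may differ in edge-case handling; this only mirrors the expected shapes and convention.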
36 | # Use this dict for storing dataset specific config options 37 | self.config = {} 38 | 39 | @property 40 | def name(self): 41 | return self._name 42 | 43 | @property 44 | def num_classes(self): 45 | return len(self._classes) 46 | 47 | @property 48 | def classes(self): 49 | return self._classes 50 | 51 | @property 52 | def image_index(self): 53 | return self._image_index 54 | 55 | @property 56 | def roidb_handler(self): 57 | return self._roidb_handler 58 | 59 | @roidb_handler.setter 60 | def roidb_handler(self, val): 61 | self._roidb_handler = val 62 | 63 | @property 64 | def roidb(self): 65 | # A roidb is a list of dictionaries, each with the following keys: 66 | # boxes 67 | # gt_overlaps 68 | # gt_classes 69 | # flipped 70 | if self._roidb is not None: 71 | return self._roidb 72 | self._roidb = self.roidb_handler() 73 | return self._roidb 74 | 75 | # @property 76 | # def cache_path(self): 77 | # cache_path = osp.abspath(osp.join(datasets.ROOT_DIR, 'data', 'cache')) 78 | # print cache_path 79 | # if not os.path.exists(cache_path): 80 | # os.makedirs(cache_path) 81 | # return cache_path 82 | 83 | @property 84 | def num_images(self): 85 | return len(self.image_index) 86 | 87 | def image_path_at(self, i): 88 | raise NotImplementedError 89 | 90 | def default_roidb(self): 91 | raise NotImplementedError 92 | 93 | def evaluate_detections(self, all_boxes, output_dir=None): 94 | """ 95 | all_boxes is a list of length number-of-classes. 96 | Each list element is a list of length number-of-images. 97 | Each of those list elements is either an empty list [] 98 | or a numpy array of detection. 99 | 100 | all_boxes[class][image] = [] or np.array of shape #dets x 5 101 | """ 102 | raise NotImplementedError 103 | 104 | def append_flipped_images(self): 105 | num_images = self.num_images 106 | widths = [PIL.Image.open(self.image_path_at(i)).size[0] 107 | for i in range(num_images)] 108 | for i in range(num_images): 109 | boxes = self.roidb[i]['boxes'].copy() 110 | oldx1 = boxes[:, 0].copy() 111 | oldx2 = boxes[:, 2].copy() 112 | boxes[:, 0] = widths[i] - oldx2 - 1 113 | boxes[:, 2] = widths[i] - oldx1 - 1 114 | assert (boxes[:, 2] >= boxes[:, 0]).all() 115 | entry = {'boxes' : boxes, 116 | 'gt_overlaps' : self.roidb[i]['gt_overlaps'], 117 | 'gt_classes' : self.roidb[i]['gt_classes'], 118 | 'flipped' : True} 119 | self.roidb.append(entry) 120 | self._image_index = self._image_index * 2 121 | 122 | def evaluate_recall(self, candidate_boxes, ar_thresh=0.5): 123 | # Record max overlap value for each gt box 124 | # Return vector of overlap values 125 | gt_overlaps = np.zeros(0) 126 | for i in range(self.num_images): 127 | gt_inds = np.where(self.roidb[i]['gt_classes'] > 0)[0] 128 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :] 129 | 130 | boxes = candidate_boxes[i] 131 | if boxes.shape[0] == 0: 132 | continue 133 | overlaps = bbox_overlaps(boxes.astype(np.float), 134 | gt_boxes.astype(np.float)) 135 | 136 | # gt_overlaps = np.hstack((gt_overlaps, overlaps.max(axis=0))) 137 | _gt_overlaps = np.zeros((gt_boxes.shape[0])) 138 | for j in range(gt_boxes.shape[0]): 139 | argmax_overlaps = overlaps.argmax(axis=0) 140 | max_overlaps = overlaps.max(axis=0) 141 | gt_ind = max_overlaps.argmax() 142 | gt_ovr = max_overlaps.max() 143 | assert(gt_ovr >= 0) 144 | box_ind = argmax_overlaps[gt_ind] 145 | _gt_overlaps[j] = overlaps[box_ind, gt_ind] 146 | assert(_gt_overlaps[j] == gt_ovr) 147 | overlaps[box_ind, :] = -1 148 | overlaps[:, gt_ind] = -1 149 | 150 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) 151 | 152 
| num_pos = gt_overlaps.size 153 | gt_overlaps = np.sort(gt_overlaps) 154 | step = 0.001 155 | thresholds = np.minimum(np.arange(0.5, 1.0 + step, step), 1.0) 156 | recalls = np.zeros_like(thresholds) 157 | for i, t in enumerate(thresholds): 158 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) 159 | ar = 2 * np.trapz(recalls, thresholds) 160 | 161 | return ar, gt_overlaps, recalls, thresholds 162 | 163 | def create_roidb_from_box_list(self, box_list, gt_roidb): 164 | assert len(box_list) == self.num_images, \ 165 | 'Number of boxes must match number of ground-truth images' 166 | roidb = [] 167 | for i in range(self.num_images): 168 | boxes = box_list[i] 169 | num_boxes = boxes.shape[0] 170 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 171 | 172 | if gt_roidb and gt_roidb[i]: 173 | gt_boxes = gt_roidb[i]['boxes'] 174 | gt_classes = gt_roidb[i]['gt_classes'] 175 | if len(gt_classes) > 0: #for pascal every image has at least one annotated object. This is not the case however if including negative images 176 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 177 | gt_boxes.astype(np.float)) 178 | 179 | argmaxes = gt_overlaps.argmax(axis=1) 180 | maxes = gt_overlaps.max(axis=1) 181 | I = np.where(maxes > 0)[0] 182 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 183 | 184 | overlaps = scipy.sparse.csr_matrix(overlaps) 185 | roidb.append({'boxes' : boxes, 186 | 'gt_classes' : np.zeros((num_boxes,), 187 | dtype=np.int32), 188 | 'gt_overlaps' : overlaps, 189 | 'flipped' : False}) 190 | return roidb 191 | 192 | @staticmethod 193 | def merge_roidbs(a, b): 194 | assert len(a) == len(b) 195 | for i in range(len(a)): 196 | if a[i]: #if image has at least one annotated object 197 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 198 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 199 | b[i]['gt_classes'])) 200 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 201 | b[i]['gt_overlaps']]) 202 | else: 203 | a[i] = b[i] 204 | return a 205 | 206 | def competition_mode(self, on): 207 | """Turn competition mode on or off.""" 208 | pass 209 | -------------------------------------------------------------------------------- /fastRCNN/nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def nms(dets, thresh): 11 | x1 = dets[:, 0] 12 | y1 = dets[:, 1] 13 | x2 = dets[:, 2] 14 | y2 = dets[:, 3] 15 | scores = dets[:, 4] 16 | 17 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 18 | order = scores.argsort()[::-1] 19 | 20 | keep = [] 21 | while order.size > 0: 22 | i = order[0] 23 | keep.append(i) 24 | xx1 = np.maximum(x1[i], x1[order[1:]]) 25 | yy1 = np.maximum(y1[i], y1[order[1:]]) 26 | xx2 = np.minimum(x2[i], x2[order[1:]]) 27 | yy2 = np.minimum(y2[i], y2[order[1:]]) 28 | 29 | w = np.maximum(0.0, xx2 - xx1 + 1) 30 | h = np.maximum(0.0, yy2 - yy1 + 1) 31 | inter = w * h 32 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 33 | 34 | inds = np.where(ovr <= thresh)[0] 35 | order = order[inds + 1] 36 | 37 | return keep 38 | -------------------------------------------------------------------------------- /fastRCNN/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # 
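As a quick illustration of the greedy NMS above (a sketch; detection rows are x1, y1, x2, y2, score):

import numpy as np
from fastRCNN.nms import nms

dets = np.array([[ 10.,  10.,  60.,  60., 0.9],
                 [ 12.,  12.,  62.,  62., 0.8],    # IoU ~0.86 with the first box
                 [100., 100., 150., 150., 0.7]])
print(nms(dets, 0.3))    # -> [0, 2]; the second box is suppressed

Boxes are visited in descending score order, and any remaining box overlapping a kept box by more than the threshold is discarded.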
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from __future__ import print_function 9 | import os, pdb 10 | import xml.dom.minidom as minidom 11 | import numpy as np 12 | import scipy.sparse 13 | import scipy.io as sio 14 | import pickle as cp 15 | import subprocess 16 | from .imdb import imdb 17 | from .voc_eval import voc_eval 18 | #from fastRCNN.imdb import imdb 19 | #from fastRCNN.voc_eval import voc_eval 20 | 21 | class pascal_voc(imdb): 22 | def __init__(self, image_set, year, classes, maxNrRois, cacheDir, devkit_path=None): 23 | imdb.__init__(self, 'voc_' + year + '_' + image_set) 24 | self._year = year 25 | self._image_set = image_set 26 | self._maxNrRois = maxNrRois 27 | self._ROOT_DIR = os.path.join(os.path.dirname(__file__), '..') 28 | self._cacheDir = cacheDir 29 | self._devkit_path = self._get_default_path() if devkit_path is None \ 30 | else devkit_path 31 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) 32 | self._classes = classes 33 | #('__background__', # always index 0 34 | # 'aeroplane', 'bicycle', 'bird', 'boat', 35 | # 'bottle', 'bus', 'car', 'cat', 'chair', 36 | # 'cow', 'diningtable', 'dog', 'horse', 37 | # 'motorbike', 'person', 'pottedplant', 38 | # 'sheep', 'sofa', 'train', 'tvmonitor') 39 | self._class_to_ind = dict(zip(self.classes, range(self.num_classes))) 40 | self._image_ext = '.jpg' 41 | self._image_index = self._load_image_set_index() 42 | # Default to roidb handler 43 | self._roidb_handler = self.selective_search_roidb 44 | 45 | # PASCAL specific config options 46 | self.config = {'cleanup' : True, 47 | 'use_salt' : True, 48 | 'top_k' : 2000} 49 | 50 | assert os.path.exists(self._devkit_path), \ 51 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path) 52 | assert os.path.exists(self._data_path), \ 53 | 'Path does not exist: {}'.format(self._data_path) 54 | 55 | @property 56 | def cache_path(self): 57 | cache_path = self._cacheDir 58 | #cache_path = osp.abspath(osp.join(datasets.ROOT_DIR, 'data', 'cache')) 59 | if not os.path.exists(cache_path): 60 | os.makedirs(cache_path) 61 | return cache_path 62 | 63 | def image_path_at(self, i): 64 | """ 65 | Return the absolute path to image i in the image sequence. 66 | """ 67 | return self.image_path_from_index(self._image_index[i]) 68 | 69 | def image_path_from_index(self, index): 70 | """ 71 | Construct an image path from the image's "index" identifier. 72 | """ 73 | image_path = os.path.join(self._data_path, 'JPEGImages', 74 | index + self._image_ext) 75 | assert os.path.exists(image_path), \ 76 | 'Path does not exist: {}'.format(image_path) 77 | return image_path 78 | 79 | def _load_image_set_index(self): 80 | """ 81 | Load the indexes listed in this dataset's image set file. 82 | """ 83 | # Example path to image set file: 84 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt 85 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 86 | self._image_set + '.txt') 87 | assert os.path.exists(image_set_file), \ 88 | 'Path does not exist: {}'.format(image_set_file) 89 | with open(image_set_file) as f: 90 | image_index = [x.strip() for x in f.readlines()] 91 | return image_index 92 | 93 | def _get_default_path(self): 94 | """ 95 | Return the default path where PASCAL VOC is expected to be installed. 
96 | """ 97 | return os.path.join(self._ROOT_DIR, 'resources', 'pascalVocData', 'VOCdevkit' + self._year) 98 | 99 | def gt_roidb(self): 100 | """ 101 | Return the database of ground-truth regions of interest. 102 | 103 | This function loads/saves from/to a cache file to speed up future calls. 104 | """ 105 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 106 | if os.path.exists(cache_file): 107 | with open(cache_file, 'rb') as fid: 108 | roidb = cp.load(fid) 109 | print ('{} gt roidb loaded from {}'.format(self.name, cache_file)) 110 | return roidb 111 | 112 | gt_roidb = [self._load_pascal_annotation(index) 113 | for index in self.image_index] 114 | with open(cache_file, 'wb') as fid: 115 | cp.dump(gt_roidb, fid, cp.HIGHEST_PROTOCOL) 116 | print ('wrote gt roidb to {}'.format(cache_file)) 117 | 118 | return gt_roidb 119 | 120 | def selective_search_roidb(self): 121 | """ 122 | Return the database of selective search regions of interest. 123 | Ground-truth ROIs are also included. 124 | 125 | This function loads/saves from/to a cache file to speed up future calls. 126 | """ 127 | cache_file = os.path.join(self.cache_path, 128 | self.name + '_selective_search_roidb.pkl') 129 | 130 | if os.path.exists(cache_file): 131 | with open(cache_file, 'rb') as fid: 132 | roidb = cp.load(fid, encoding='latin1') 133 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file)) 134 | return roidb 135 | 136 | if int(self._year) == 2007 or not self._image_set.startswith('test'): 137 | gt_roidb = self.gt_roidb() 138 | ss_roidb = self._load_selective_search_roidb(gt_roidb) 139 | roidb = imdb.merge_roidbs(gt_roidb, ss_roidb) 140 | else: 141 | roidb = self._load_selective_search_roidb(None) 142 | 143 | # Keep max of e.g. 2000 rois 144 | if type(self._maxNrRois) == int: 145 | print ("Only keep the first %d ROIs..." % self._maxNrRois) 146 | for i in range(self.num_images): 147 | gt_overlaps = roidb[i]['gt_overlaps'] 148 | gt_overlaps = gt_overlaps.todense()[:self._maxNrRois] 149 | gt_overlaps = scipy.sparse.csr_matrix(gt_overlaps) 150 | roidb[i]['boxes'] = roidb[i]['boxes'][:self._maxNrRois, :] 151 | roidb[i]['gt_classes'] = roidb[i]['gt_classes'][:self._maxNrRois] 152 | roidb[i]['gt_overlaps'] = roidb[i]['gt_overlaps'] = gt_overlaps 153 | 154 | with open(cache_file, 'wb') as fid: 155 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL) 156 | print ('wrote ss roidb to {}'.format(cache_file)) 157 | 158 | return roidb 159 | 160 | def _load_selective_search_roidb(self, gt_roidb): 161 | filename = os.path.abspath(os.path.join(self._devkit_path, '..', 162 | 'selective_search_data', 163 | self.name + '.mat')) 164 | assert os.path.exists(filename), \ 165 | 'Selective search data not found at: {}'.format(filename) 166 | raw_data = sio.loadmat(filename)['boxes'].ravel() 167 | 168 | box_list = [] 169 | for i in range(raw_data.shape[0]): 170 | box_list.append(raw_data[i][:, (1, 0, 3, 2)] - 1) 171 | 172 | return self.create_roidb_from_box_list(box_list, gt_roidb) 173 | 174 | def selective_search_IJCV_roidb(self): 175 | """ 176 | Return the database of selective search regions of interest. 177 | Ground-truth ROIs are also included. 178 | 179 | This function loads/saves from/to a cache file to speed up future calls. 180 | """ 181 | cache_file = os.path.join(self.cache_path, 182 | '{:s}_selective_search_IJCV_top_{:d}_roidb.pkl'. 
183 | format(self.name, self.config['top_k'])) 184 | 185 | if os.path.exists(cache_file): 186 | with open(cache_file, 'rb') as fid: 187 | roidb = cp.load(fid) 188 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file)) 189 | return roidb 190 | 191 | gt_roidb = self.gt_roidb() 192 | ss_roidb = self._load_selective_search_IJCV_roidb(gt_roidb) 193 | roidb = imdb.merge_roidbs(gt_roidb, ss_roidb) 194 | with open(cache_file, 'wb') as fid: 195 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL) 196 | print ('wrote ss roidb to {}'.format(cache_file)) 197 | 198 | return roidb 199 | 200 | def _load_selective_search_IJCV_roidb(self, gt_roidb): 201 | IJCV_path = os.path.abspath(os.path.join(self.cache_path, '..', 202 | 'selective_search_IJCV_data', 203 | 'voc_' + self._year)) 204 | assert os.path.exists(IJCV_path), \ 205 | 'Selective search IJCV data not found at: {}'.format(IJCV_path) 206 | 207 | top_k = self.config['top_k'] 208 | box_list = [] 209 | for i in range(self.num_images): 210 | filename = os.path.join(IJCV_path, self.image_index[i] + '.mat') 211 | raw_data = sio.loadmat(filename) 212 | box_list.append((raw_data['boxes'][:top_k, :]-1).astype(np.uint16)) 213 | 214 | return self.create_roidb_from_box_list(box_list, gt_roidb) 215 | 216 | def _load_pascal_annotation(self, index): 217 | """ 218 | Load image and bounding boxes info from XML file in the PASCAL VOC 219 | format. 220 | """ 221 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml') 222 | # print ('Loading: {}'.format(filename)) 223 | def get_data_from_tag(node, tag): 224 | return node.getElementsByTagName(tag)[0].childNodes[0].data 225 | 226 | with open(filename) as f: 227 | data = minidom.parseString(f.read()) 228 | 229 | objs = data.getElementsByTagName('object') 230 | num_objs = len(objs) 231 | 232 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 233 | gt_classes = np.zeros((num_objs), dtype=np.int32) 234 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 235 | 236 | # Load object bounding boxes into a data frame. 237 | for ix, obj in enumerate(objs): 238 | # Make pixel indexes 0-based 239 | x1 = float(get_data_from_tag(obj, 'xmin')) - 1 240 | y1 = float(get_data_from_tag(obj, 'ymin')) - 1 241 | x2 = float(get_data_from_tag(obj, 'xmax')) - 1 242 | y2 = float(get_data_from_tag(obj, 'ymax')) - 1 243 | cls = self._class_to_ind[ 244 | str(get_data_from_tag(obj, "name")).lower().strip()] 245 | boxes[ix, :] = [x1, y1, x2, y2] 246 | gt_classes[ix] = cls 247 | overlaps[ix, cls] = 1.0 248 | 249 | overlaps = scipy.sparse.csr_matrix(overlaps) 250 | 251 | return {'boxes' : boxes, 252 | 'gt_classes': gt_classes, 253 | 'gt_overlaps' : overlaps, 254 | 'flipped' : False} 255 | 256 | def _write_voc_results_file(self, all_boxes, output_dir): 257 | comp_id = 'comp4' 258 | if self.config['use_salt']: 259 | comp_id += '-{}'.format(os.getpid()) 260 | 261 | for cls_ind, cls in enumerate(self.classes): 262 | if cls == '__background__': 263 | continue 264 | print ('Writing {} VOC results file'.format(cls)) 265 | filename = self._get_voc_results_file_template(output_dir).format(cls) 266 | with open(filename, 'wt') as f: 267 | for im_ind, index in enumerate(self.image_index): 268 | dets = all_boxes[cls_ind][im_ind] 269 | if dets == []: 270 | continue 271 | # the VOCdevkit expects 1-based indices 272 | for k in range(dets.shape[0]): 273 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
274 | format(index, dets[k, -1], 275 | dets[k, 0] + 1, dets[k, 1] + 1, 276 | dets[k, 2] + 1, dets[k, 3] + 1)) 277 | return comp_id 278 | 279 | def evaluate_detections(self, all_boxes, output_dir, boUsePythonImpl = True, use_07_metric = False): 280 | self._write_voc_results_file(all_boxes, output_dir) 281 | if not boUsePythonImpl: 282 | self._do_matlab_eval(comp_id, output_dir) 283 | else: 284 | self._do_python_eval(output_dir, use_07_metric) 285 | return [] 286 | 287 | def _do_matlab_eval(self, comp_id, output_dir='output'): 288 | rm_results = self.config['cleanup'] 289 | 290 | path = os.path.join(os.path.dirname(__file__), 291 | 'VOCdevkit-matlab-wrapper') 292 | cmd = 'cd {} && '.format(path) 293 | cmd += '{:s} -nodisplay -nodesktop '.format(datasets.MATLAB) 294 | cmd += '-r "dbstop if error; ' 295 | cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\',{:d}); quit;"' \ 296 | .format(self._devkit_path, comp_id, 297 | self._image_set, output_dir, int(rm_results)) 298 | print('Running:\n{}'.format(cmd)) 299 | status = subprocess.call(cmd, shell=True) 300 | 301 | def competition_mode(self, on): 302 | if on: 303 | self.config['use_salt'] = False 304 | self.config['cleanup'] = False 305 | else: 306 | self.config['use_salt'] = True 307 | self.config['cleanup'] = True 308 | 309 | ######################################################################### 310 | # Python evaluation functions (copied from faster-RCNN) 311 | ########################################################################## 312 | def _get_voc_results_file_template(self, evalDir): 313 | if not os.path.exists(evalDir): 314 | os.makedirs(evalDir) 315 | filename = self._image_set + '_{:s}.txt' 316 | return os.path.join(evalDir, filename) 317 | 318 | def _do_python_eval(self, output_dir='output', use_07_metric=None): 319 | annopath = os.path.join(self._devkit_path, 'VOC' + self._year, 'Annotations', '{}.xml') 320 | imagesetfile = os.path.join( 321 | self._devkit_path, 322 | 'VOC' + self._year, 323 | 'ImageSets', 324 | 'Main', 325 | self._image_set + '.txt') 326 | aps = [] 327 | # The PASCAL VOC metric changed in 2010 328 | if use_07_metric == None: 329 | use_07_metric = True if int(self._year) < 2010 else False 330 | 331 | print ('VOC07 metric? 
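Each class-specific results file written by _write_voc_results_file above is plain text with one detection per line, image_id score x1 y1 x2 y2, in 1-based coordinates, e.g. (illustrative values):

000012 0.912 48.0 240.0 195.0 371.0
000017 0.433 8.0 12.0 352.0 498.0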
' + ('Yes' if use_07_metric else 'No')) 332 | if not os.path.isdir(output_dir): 333 | os.mkdir(output_dir) 334 | for i, cls in enumerate(self._classes): 335 | if cls == '__background__': 336 | continue 337 | filename = self._get_voc_results_file_template(output_dir).format(cls) 338 | 339 | rec, prec, ap = voc_eval( 340 | filename, annopath, imagesetfile, cls, cachedir = output_dir, ovthresh=0.5, 341 | use_07_metric=use_07_metric) 342 | aps += [ap] 343 | print('AP for {} = {:.4f}'.format(cls, ap)) 344 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 345 | cp.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 346 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 347 | # print('~~~~~~~~') 348 | # print('Results:') 349 | # for ap in aps: 350 | # print('{:.3f}'.format(ap)) 351 | # print('{:.3f}'.format(np.mean(aps))) 352 | # print('~~~~~~~~') 353 | # print('') 354 | print('--------------------------------------------------------------') 355 | print('Results computed with the **unofficial** Python eval code.') 356 | print('Results should be very close to the official MATLAB eval code.') 357 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 358 | print('-- Thanks, The Management') 359 | print('--------------------------------------------------------------') 360 | 361 | if __name__ == '__main__': 362 | d = datasets.pascal_voc('trainval', '2007') 363 | res = d.roidb 364 | from IPython import embed; embed() -------------------------------------------------------------------------------- /fastRCNN/test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Test a Fast R-CNN network on an imdb (image database).""" 9 | 10 | #from config import cfg #, get_output_dir 11 | #from blob import im_list_to_blob 12 | from __future__ import print_function 13 | import os, sys, cv2, numpy as np, pickle as cp, heapq 14 | from .nms import nms as nmsPython 15 | from .timer import Timer 16 | from helpers import im_detect, apply_nms 17 | from builtins import range 18 | import pdb 19 | 20 | # if sys.version_info[0] < 3: 21 | # from utils2_win64.cython_nms import nms 22 | # else: 23 | # from .utils3_win64.cython_nms import nms 24 | 25 | if sys.version_info[1] == 4 and sys.version_info[0] == 3: 26 | from .utils34_win64.cython_nms import nms 27 | elif sys.version_info[1] == 5 and sys.version_info[0] == 3: 28 | from .utils35_win64.cython_nms import nms 29 | else: 30 | print("ERROR: Python version {} not supported".format(sys.version_info)) 31 | error 32 | 33 | 34 | 35 | def _get_image_blob(im): 36 | """Converts an image into a network input. 
37 | 38 | Arguments: 39 | im (ndarray): a color image in BGR order 40 | 41 | Returns: 42 | blob (ndarray): a data blob holding an image pyramid 43 | im_scale_factors (list): list of image scales (relative to im) used 44 | in the image pyramid 45 | """ 46 | im_orig = im.astype(np.float32, copy=True) 47 | im_orig -= cfg.PIXEL_MEANS 48 | 49 | im_shape = im_orig.shape 50 | im_size_min = np.min(im_shape[0:2]) 51 | im_size_max = np.max(im_shape[0:2]) 52 | 53 | processed_ims = [] 54 | im_scale_factors = [] 55 | 56 | for target_size in cfg.TEST.SCALES: 57 | im_scale = float(target_size) / float(im_size_min) 58 | # Prevent the biggest axis from being more than MAX_SIZE 59 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 60 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 61 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 62 | interpolation=cv2.INTER_LINEAR) 63 | im_scale_factors.append(im_scale) 64 | processed_ims.append(im) 65 | 66 | # Create a blob to hold the input images 67 | blob = im_list_to_blob(processed_ims) 68 | 69 | return blob, np.array(im_scale_factors) 70 | 71 | def _get_rois_blob(im_rois, im_scale_factors): 72 | """Converts RoIs into network inputs. 73 | 74 | Arguments: 75 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 76 | im_scale_factors (list): scale factors as returned by _get_image_blob 77 | 78 | Returns: 79 | blob (ndarray): R x 5 matrix of RoIs in the image pyramid 80 | """ 81 | rois, levels = _project_im_rois(im_rois, im_scale_factors) 82 | rois_blob = np.hstack((levels, rois)) 83 | return rois_blob.astype(np.float32, copy=False) 84 | 85 | def _project_im_rois(im_rois, scales): 86 | """Project image RoIs into the image pyramid built by _get_image_blob. 87 | 88 | Arguments: 89 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 90 | scales (list): scale factors as returned by _get_image_blob 91 | 92 | Returns: 93 | rois (ndarray): R x 4 matrix of projected RoI coordinates 94 | levels (list): image pyramid levels used by each projected RoI 95 | """ 96 | im_rois = im_rois.astype(np.float, copy=False) 97 | 98 | if len(scales) > 1: 99 | widths = im_rois[:, 2] - im_rois[:, 0] + 1 100 | heights = im_rois[:, 3] - im_rois[:, 1] + 1 101 | 102 | areas = widths * heights 103 | scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2) 104 | diff_areas = np.abs(scaled_areas - 224 * 224) 105 | levels = diff_areas.argmin(axis=1)[:, np.newaxis] 106 | else: 107 | levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) 108 | 109 | rois = im_rois * scales[levels] 110 | 111 | return rois, levels 112 | 113 | def _get_blobs(im, rois): 114 | """Convert an image and RoIs within that image into network inputs.""" 115 | blobs = {'data' : None, 'rois' : None} 116 | blobs['data'], im_scale_factors = _get_image_blob(im) 117 | blobs['rois'] = _get_rois_blob(rois, im_scale_factors) 118 | return blobs, im_scale_factors 119 | 120 | def _bbox_pred(boxes, box_deltas): 121 | """Transform the set of class-agnostic boxes into class-specific boxes 122 | by applying the predicted offsets (box_deltas) 123 | """ 124 | if boxes.shape[0] == 0: 125 | return np.zeros((0, box_deltas.shape[1])) 126 | 127 | boxes = boxes.astype(np.float, copy=False) 128 | widths = boxes[:, 2] - boxes[:, 0] + cfg.EPS 129 | heights = boxes[:, 3] - boxes[:, 1] + cfg.EPS 130 | ctr_x = boxes[:, 0] + 0.5 * widths 131 | ctr_y = boxes[:, 1] + 0.5 * heights 132 | 133 | dx = box_deltas[:, 0::4] 134 | dy = box_deltas[:, 1::4] 135 | dw = box_deltas[:, 2::4] 136 | 
dh = box_deltas[:, 3::4] 137 | 138 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 139 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 140 | pred_w = np.exp(dw) * widths[:, np.newaxis] 141 | pred_h = np.exp(dh) * heights[:, np.newaxis] 142 | 143 | pred_boxes = np.zeros(box_deltas.shape) 144 | # x1 145 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 146 | # y1 147 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 148 | # x2 149 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 150 | # y2 151 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 152 | 153 | return pred_boxes 154 | 155 | def _clip_boxes(boxes, im_shape): 156 | """Clip boxes to image boundaries.""" 157 | # x1 >= 0 158 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 159 | # y1 >= 0 160 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) 161 | # x2 < im_shape[1] 162 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) 163 | # y2 < im_shape[0] 164 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) 165 | return boxes 166 | 167 | # def im_detect(net, im, boxes): 168 | # """Detect object classes in an image given object proposals. 169 | # 170 | # Arguments: 171 | # net (caffe.Net): Fast R-CNN network to use 172 | # im (ndarray): color image to test (in BGR order) 173 | # boxes (ndarray): R x 4 array of object proposals 174 | # 175 | # Returns: 176 | # scores (ndarray): R x K array of object class scores (K includes 177 | # background as object category 0) 178 | # boxes (ndarray): R x (4*K) array of predicted bounding boxes 179 | # """ 180 | # blobs, unused_im_scale_factors = _get_blobs(im, boxes) 181 | # 182 | # # When mapping from image ROIs to feature map ROIs, there's some aliasing 183 | # # (some distinct image ROIs get mapped to the same feature ROI). 184 | # # Here, we identify duplicate feature ROIs, so we only compute features 185 | # # on the unique subset. 
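To make the decoding in _bbox_pred above concrete (treating cfg.EPS as negligible): a proposal (0, 0, 99, 99) has width = height = 99 and center (49.5, 49.5); applying deltas (dx, dy, dw, dh) = (0.1, 0, ln 2, 0) moves the center to x = 49.5 + 0.1 * 99 = 59.4 and doubles the width to 198, giving roughly (-39.6, 0.0, 158.4, 99.0), which _clip_boxes then clamps to the image boundaries.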
186 | # if cfg.DEDUP_BOXES > 0: 187 | # v = np.array([1, 1e3, 1e6, 1e9, 1e12]) 188 | # hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v) 189 | # _, index, inv_index = np.unique(hashes, return_index=True, 190 | # return_inverse=True) 191 | # blobs['rois'] = blobs['rois'][index, :] 192 | # boxes = boxes[index, :] 193 | # 194 | # # reshape network inputs 195 | # net.blobs['data'].reshape(*(blobs['data'].shape)) 196 | # net.blobs['rois'].reshape(*(blobs['rois'].shape)) 197 | # blobs_out = net.forward(data=blobs['data'].astype(np.float32, copy=False), 198 | # rois=blobs['rois'].astype(np.float32, copy=False)) 199 | # if cfg.TEST.SVM: 200 | # # use the raw scores before softmax under the assumption they 201 | # # were trained as linear SVMs 202 | # scores = net.blobs['cls_score'].data 203 | # else: 204 | # # use softmax estimated probabilities 205 | # scores = blobs_out['cls_prob'] 206 | # 207 | # if cfg.TEST.BBOX_REG: 208 | # # Apply bounding-box regression deltas 209 | # box_deltas = blobs_out['bbox_pred'] 210 | # pred_boxes = _bbox_pred(boxes, box_deltas) 211 | # pred_boxes = _clip_boxes(pred_boxes, im.shape) 212 | # else: 213 | # # Simply repeat the boxes, once for each class 214 | # pred_boxes = np.tile(boxes, (1, scores.shape[1])) 215 | # 216 | # if cfg.DEDUP_BOXES > 0: 217 | # # Map scores and predictions back to the original set of boxes 218 | # scores = scores[inv_index, :] 219 | # pred_boxes = pred_boxes[inv_index, :] 220 | # 221 | # return scores, pred_boxes 222 | 223 | def vis_detections(im, class_name, dets, thresh=0.3): 224 | """Visual debugging of detections.""" 225 | import matplotlib.pyplot as plt 226 | im = im[:, :, (2, 1, 0)] 227 | for i in range(np.minimum(10, dets.shape[0])): 228 | bbox = dets[i, :4] 229 | score = dets[i, -1] 230 | if score > thresh: 231 | plt.cla() 232 | plt.imshow(im) 233 | plt.gca().add_patch( 234 | plt.Rectangle((bbox[0], bbox[1]), 235 | bbox[2] - bbox[0], 236 | bbox[3] - bbox[1], fill=False, 237 | edgecolor='g', linewidth=3) 238 | ) 239 | plt.title('{} {:.3f}'.format(class_name, score)) 240 | plt.show() 241 | 242 | 243 | 244 | # TODO: MOVE THIS TO CNTK HELPERS 245 | # def test_net_noThreshold(): 246 | # #boxes = roidb[i]['boxes'] 247 | # scores, _, _ = im_detect(net, i, boxes, feature_scale=feature_scale, classifier=classifier) 248 | # 249 | # for j in range(1, imdb.num_classes): 250 | # inds = np.where(roidb[i]['gt_classes'] == 0)[0] 251 | # cls_scores = scores[inds, j] 252 | # cls_boxes = roidb[i]['boxes'][inds] 253 | # all_boxes[j][i] = \ 254 | # np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ 255 | # .astype(np.float32, copy=False) 256 | # 257 | 258 | 259 | def test_net(net, imdb, output_dir, feature_scale, classifier = 'svm', nmsThreshold = 0.3, 260 | boUsePythonImpl = False, boThresholdDetections = True, boApplyNms = True, 261 | overlapThreshold=0.5): 262 | """Test a Fast R-CNN network on an image database.""" 263 | num_images = len(imdb.image_index) 264 | # heuristic: keep an average of 40 detections per class per image prior 265 | # to NMS 266 | max_per_set = 40 * num_images 267 | # heuristic: keep at most 100 detections per class per image prior to NMS 268 | max_per_image = 100 269 | # detection threshold for each class (this is adaptively set based on the 270 | # max_per_set constraint) 271 | thresh = -np.inf * np.ones(imdb.num_classes) 272 | # top_scores will hold one minheap of scores per class (used to enforce 273 | # the max_per_set constraint) 274 | top_scores = [[] for _ in range(imdb.num_classes)] 275 | # all detections are collected into:
276 | # all_boxes[cls][image] = N x 5 array of detections in 277 | # (x1, y1, x2, y2, score) 278 | all_boxes = [[[] for _ in range(num_images)] 279 | for _ in range(imdb.num_classes)] 280 | 281 | #output_dir = get_output_dir(imdb, net) 282 | 283 | # timers 284 | _t = {'im_detect' : Timer(), 'misc' : Timer()} 285 | roidb = imdb.roidb 286 | 287 | if not boThresholdDetections: 288 | for i in range(num_images): 289 | if i % 100 == 0: 290 | print (" Processing image {} of {}..".format(i, num_images)) 291 | scores, _, _ = im_detect(net, i, roidb[i]['boxes'], feature_scale=feature_scale, classifier=classifier) 292 | 293 | for j in range(1, imdb.num_classes): 294 | inds = np.where(roidb[i]['gt_classes'] == 0)[0] 295 | cls_scores = scores[inds, j] 296 | cls_boxes = roidb[i]['boxes'][inds] 297 | all_boxes[j][i] = \ 298 | np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ 299 | .astype(np.float32, copy=False) 300 | 301 | else: 302 | for i in range(num_images): 303 | if i % 100 == 0: 304 | print (" Processing image {} of {}..".format(i, num_images)) 305 | #im = cv2.imread(imdb.image_path_at(i)) 306 | #_t['im_detect'].tic() 307 | scores, _, _ = im_detect(net, i, roidb[i]['boxes'], feature_scale = feature_scale, classifier = classifier) 308 | #_t['im_detect'].toc() 309 | 310 | _t['misc'].tic() 311 | for j in range(1, imdb.num_classes): 312 | #only get detections with high scores AND exclude ground truth ROIs 313 | inds = np.where((scores[:, j] > thresh[j]) & 314 | (roidb[i]['gt_classes'] == 0))[0] 315 | cls_scores = scores[inds, j] 316 | 317 | # cls_boxes = boxes[inds, j * 4:(j + 1) * 4] 318 | boxes = roidb[i]['boxes'] 319 | cls_boxes = boxes[inds] 320 | 321 | top_inds = np.argsort(-cls_scores)[:max_per_image] 322 | cls_scores = cls_scores[top_inds] 323 | cls_boxes = cls_boxes[top_inds, :] 324 | # push new scores onto the minheap 325 | for val in cls_scores: 326 | heapq.heappush(top_scores[j], val) 327 | # if we've collected more than the max number of detections, 328 | # then pop items off the minheap and update the class threshold 329 | if len(top_scores[j]) > max_per_set: 330 | while len(top_scores[j]) > max_per_set: 331 | heapq.heappop(top_scores[j]) 332 | thresh[j] = top_scores[j][0] 333 | 334 | all_boxes[j][i] = \ 335 | np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ 336 | .astype(np.float32, copy=False) 337 | 338 | #visualize rois 339 | if False and i == 6 and j == 15: 340 | im = cv2.imread(imdb.image_path_at(i)) 341 | if boUsePythonImpl: 342 | nms_boxes, nms_keepIndices = apply_nms(all_boxes, nmsThreshold, boUsePythonImpl = True) 343 | keep = nms_keepIndices[j][i] 344 | else: 345 | keep = nms(all_boxes[j][i], 0.3) 346 | #vis_detections(im, imdb.classes[j], all_boxes[j][i]) 347 | vis_detections(im, imdb.classes[j], all_boxes[j][i][keep, :]) #, thres=-10.0) 348 | _t['misc'].toc() 349 | 350 | # print ('im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \ 351 | # .format(i + 1, num_images, _t['im_detect'].average_time, 352 | # _t['misc'].average_time)) 353 | 354 | # for j in range(1, imdb.num_classes): 355 | # thresh[j] = max(0.5, thresh[j]) 356 | # print("thresh[{}] = {}".format(j, thresh[j])) 357 | 358 | #keep only the boxes with highest score for each class 359 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coord+score
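The max_per_set bookkeeping above keeps one min-heap of scores per class, so the per-class threshold rises as more images are processed; in miniature (an illustrative sketch):

import heapq

top_scores, max_per_set, thresh = [], 3, -float('inf')
for s in [0.9, 0.2, 0.8, 0.7, 0.95]:    # scores arriving image by image
    heapq.heappush(top_scores, s)
    while len(top_scores) > max_per_set:
        heapq.heappop(top_scores)       # evict the weakest score
        thresh = top_scores[0]          # weakest survivor becomes the threshold
print(thresh)                           # -> 0.8; only the top-3 scores pass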
360 | for j in range(1, imdb.num_classes): 361 | for i in range(num_images): 362 | inds = np.where(all_boxes[j][i][:, -1] > thresh[j])[0] 363 | if len(inds) == 0: 364 | all_boxes[j][i] = [] 365 | else: 366 | all_boxes[j][i] = all_boxes[j][i][inds, :] 367 | 368 | if output_dir: 369 | det_file = os.path.join(output_dir, 'detections.pkl') 370 | with open(det_file, 'wb') as f: 371 | cp.dump(all_boxes, f, cp.HIGHEST_PROTOCOL) 372 | 373 | if boApplyNms: 374 | print ("Number of rois before non-maxima suppression: %d" % sum([len(all_boxes[i][j]) for i in range(imdb.num_classes) for j in range(imdb.num_images)])) 375 | nms_dets,_ = apply_nms(all_boxes, nmsThreshold, boUsePythonImpl) 376 | print ("Number of rois after non-maxima suppression: %d" % sum([len(nms_dets[i][j]) for i in range(imdb.num_classes) for j in range(imdb.num_images)])) 377 | else: 378 | print ("Skipping non-maxima suppression") 379 | nms_dets = all_boxes 380 | 381 | print ('Evaluating detections') 382 | return imdb.evaluate_detections(nms_dets, output_dir, overlapThreshold) 383 | -------------------------------------------------------------------------------- /fastRCNN/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /fastRCNN/train_svms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """ 11 | Train post-hoc SVMs using the algorithm and hyper-parameters from 12 | traditional R-CNN. 13 | """ 14 | 15 | from .timer import Timer 16 | from sklearn import svm 17 | import numpy as np 18 | 19 | 20 | 21 | ################################################# 22 | # Slightly modified SVM training functions 23 | ################################################# 24 | class SVMTrainer(object): 25 | """ 26 | Trains post-hoc detection SVMs for all classes using the algorithm 27 | and hyper-parameters of traditional R-CNN. 
28 | """ 29 | 30 | def __init__(self, net, imdb, im_detect, svmWeightsPath, svmBiasPath, svmFeatScalePath, 31 | svm_C, svm_B, svm_nrEpochs, svm_retrainLimit, svm_evictThreshold, svm_posWeight, 32 | svm_targetNorm, svm_penality, svm_loss, svm_rngSeed): 33 | self.net = net 34 | self.imdb = imdb 35 | self.im_detect = im_detect 36 | self.svm_nrEpochs = svm_nrEpochs 37 | self.svm_targetNorm = svm_targetNorm 38 | self.svmWeightsPath = svmWeightsPath 39 | self.svmBiasPath = svmBiasPath 40 | self.svmFeatScalePath = svmFeatScalePath 41 | self.layer = 'fc7' 42 | self.hard_thresh = -1.0001 43 | self.neg_iou_thresh = 0.3 44 | dim = net.params['cls_score'][0].data.shape[1] 45 | self.feature_scale = self._get_feature_scale() 46 | print('Feature dim: {}'.format(dim)) 47 | print('Feature scale: {:.3f}'.format(self.feature_scale)) 48 | self.trainers = [SVMClassTrainer(cls, dim, self.feature_scale, svm_C, svm_B, svm_posWeight, svm_penality, svm_loss, 49 | svm_rngSeed, svm_retrainLimit, svm_evictThreshold) for cls in imdb.classes] 50 | 51 | 52 | def _get_feature_scale(self, num_images=100): 53 | _t = Timer() 54 | roidb = self.imdb.roidb 55 | total_norm = 0.0 56 | total_sum = 0.0 57 | count = 0.0 58 | num_images = min(num_images, self.imdb.num_images) 59 | inds = np.random.choice(range(self.imdb.num_images), size=num_images, replace=False) 60 | 61 | for i_, i in enumerate(inds): 62 | #im = cv2.imread(self.imdb.image_path_at(i)) 63 | #if roidb[i]['flipped']: 64 | # im = im[:, ::-1, :] 65 | #im = self.imdb.image_path_at(i) 66 | _t.tic() 67 | scores, boxes, feat = self.im_detect(self.net, i, roidb[i]['boxes'], boReturnClassifierScore = False) 68 | _t.toc() 69 | #feat = self.net.blobs[self.layer].data 70 | total_norm += np.sqrt((feat ** 2).sum(axis=1)).sum() 71 | total_sum += 1.0 * sum(sum(feat)) / len(feat) 72 | count += feat.shape[0] 73 | print('{}/{}: avg feature norm: {:.3f}, average value: {:.3f}'.format(i_ + 1, num_images, 74 | total_norm / count, total_sum / count)) 75 | 76 | return self.svm_targetNorm * 1.0 / (total_norm / count) 77 | 78 | def _get_pos_counts(self): 79 | counts = np.zeros((len(self.imdb.classes)), dtype=np.int) 80 | roidb = self.imdb.roidb 81 | for i in range(len(roidb)): 82 | for j in range(1, self.imdb.num_classes): 83 | I = np.where(roidb[i]['gt_classes'] == j)[0] 84 | counts[j] += len(I) 85 | 86 | for j in range(1, self.imdb.num_classes): 87 | print('class {:s} has {:d} positives'. 
88 | format(self.imdb.classes[j], counts[j])) 89 | 90 | return counts 91 | 92 | def get_pos_examples(self): 93 | counts = self._get_pos_counts() 94 | for i in range(len(counts)): 95 | self.trainers[i].alloc_pos(counts[i]) 96 | 97 | _t = Timer() 98 | roidb = self.imdb.roidb 99 | num_images = len(roidb) 100 | for i in range(num_images): 101 | #im = cv2.imread(self.imdb.image_path_at(i)) 102 | #if roidb[i]['flipped']: 103 | # im = im[:, ::-1, :] 104 | #im = self.imdb.image_path_at(i) 105 | gt_inds = np.where(roidb[i]['gt_classes'] > 0)[0] 106 | gt_boxes = roidb[i]['boxes'][gt_inds] 107 | _t.tic() 108 | scores, boxes, feat = self.im_detect(self.net, i, gt_boxes, self.feature_scale, gt_inds, boReturnClassifierScore = False) 109 | _t.toc() 110 | #feat = self.net.blobs[self.layer].data 111 | for j in range(1, self.imdb.num_classes): 112 | cls_inds = np.where(roidb[i]['gt_classes'][gt_inds] == j)[0] 113 | if len(cls_inds) > 0: 114 | cls_feat = feat[cls_inds, :] 115 | self.trainers[j].append_pos(cls_feat) 116 | if i % 50 == 0: 117 | print('get_pos_examples: {:d}/{:d} {:.3f}s' \ 118 | .format(i + 1, len(roidb), _t.average_time)) 119 | 120 | def initialize_net(self): 121 | # Start all SVM parameters at zero 122 | self.net.params['cls_score'][0].data[...] = 0 123 | self.net.params['cls_score'][1].data[...] = 0 124 | 125 | # Initialize SVMs in a smart way. Not doing this because its such 126 | # a good initialization that we might not learn something close to 127 | # the SVM solution. 128 | # # subtract background weights and biases for the foreground classes 129 | # w_bg = self.net.params['cls_score'][0].data[0, :] 130 | # b_bg = self.net.params['cls_score'][1].data[0] 131 | # self.net.params['cls_score'][0].data[1:, :] -= w_bg 132 | # self.net.params['cls_score'][1].data[1:] -= b_bg 133 | # # set the background weights and biases to 0 (where they shall remain) 134 | # self.net.params['cls_score'][0].data[0, :] = 0 135 | # self.net.params['cls_score'][1].data[0] = 0 136 | 137 | def update_net(self, cls_ind, w, b): 138 | self.net.params['cls_score'][0].data[cls_ind, :] = w 139 | self.net.params['cls_score'][1].data[cls_ind] = b 140 | 141 | def train_with_hard_negatives(self): 142 | _t = Timer() 143 | roidb = self.imdb.roidb 144 | num_images = len(roidb) 145 | 146 | for epoch in range(0,self.svm_nrEpochs): 147 | 148 | # num_images = 100 149 | for i in range(num_images): 150 | print("*** EPOCH = %d, IMAGE = %d *** " % (epoch, i)) 151 | #im = cv2.imread(self.imdb.image_path_at(i)) 152 | #if roidb[i]['flipped']: 153 | # im = im[:, ::-1, :] 154 | #im = self.imdb.image_path_at(i) 155 | _t.tic() 156 | scores, boxes, feat = self.im_detect(self.net, i, roidb[i]['boxes'], self.feature_scale) 157 | _t.toc() 158 | #feat = self.net.blobs[self.layer].data 159 | for j in range(1, self.imdb.num_classes): 160 | hard_inds = \ 161 | np.where((scores[:, j] > self.hard_thresh) & 162 | (roidb[i]['gt_overlaps'][:, j].toarray().ravel() < 163 | self.neg_iou_thresh))[0] 164 | if len(hard_inds) > 0: 165 | hard_feat = feat[hard_inds, :].copy() 166 | new_w_b = \ 167 | self.trainers[j].append_neg_and_retrain(feat=hard_feat) 168 | if new_w_b is not None: 169 | self.update_net(j, new_w_b[0], new_w_b[1]) 170 | np.savetxt(self.svmWeightsPath[:-4] + "_epoch" + str(epoch) + ".txt", self.net.params['cls_score'][0].data) 171 | np.savetxt(self.svmBiasPath[:-4] + "_epoch" + str(epoch) + ".txt", self.net.params['cls_score'][1].data) 172 | np.savetxt(self.svmFeatScalePath[:-4] + "_epoch" + str(epoch) + ".txt", [self.feature_scale]) 173 | 174 | 
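Stripped to its essence, the hard-negative mining loop above alternates between fitting and growing/pruning the negative cache, roughly as in this sketch (illustrative only; the feature dimension and constants are made up, and the real trainer re-scores all ROIs to find new false positives):

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
pos = rng.randn(50, 4096).astype(np.float32)     # cached positive features (fixed)
neg = rng.randn(200, 4096).astype(np.float32)    # negative cache (grows and shrinks)
clf = svm.LinearSVC(C=0.001, class_weight={1: 2.0, -1: 1.0}, dual=True)
for epoch in range(2):
    X = np.vstack((pos, neg))
    y = np.hstack((np.ones(len(pos)), -np.ones(len(neg))))
    clf.fit(X, y)
    neg_scores = clf.decision_function(X)[len(pos):]
    neg = neg[neg_scores >= -1.0]                # evict easy negatives far outside the margin
    # ...append newly mined hard negatives to `neg` here before the next fit.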
print(('train_with_hard_negatives: ' 175 | '{:d}/{:d} {:.3f}s').format(i + 1, len(roidb), 176 | _t.average_time)) 177 | 178 | def train(self): 179 | # Initialize SVMs using 180 | # a. w_i = fc8_w_i - fc8_w_0 181 | # b. b_i = fc8_b_i - fc8_b_0 182 | # c. Install SVMs into net 183 | self.initialize_net() 184 | 185 | # Pass over roidb to count num positives for each class 186 | # a. Pre-allocate arrays for positive feature vectors 187 | # Pass over roidb, computing features for positives only 188 | self.get_pos_examples() 189 | 190 | # Pass over roidb 191 | # a. Compute cls_score with forward pass 192 | # b. For each class 193 | # i. Select hard negatives 194 | # ii. Add them to cache 195 | # c. For each class 196 | # i. If SVM retrain criteria met, update SVM 197 | # ii. Install new SVM into net 198 | self.train_with_hard_negatives() 199 | 200 | # One final SVM retraining for each class 201 | # Install SVMs into net 202 | for j in range(1, self.imdb.num_classes): 203 | new_w_b = self.trainers[j].append_neg_and_retrain(force=True) 204 | self.update_net(j, new_w_b[0], new_w_b[1]) 205 | 206 | #save svm 207 | np.savetxt(self.svmWeightsPath, self.net.params['cls_score'][0].data) 208 | np.savetxt(self.svmBiasPath, self.net.params['cls_score'][1].data) 209 | np.savetxt(self.svmFeatScalePath, [self.feature_scale]) 210 | 211 | 212 | class SVMClassTrainer(object): 213 | """Manages post-hoc SVM training for a single object class.""" 214 | 215 | def __init__(self, cls, dim, feature_scale, 216 | C, B, pos_weight, svm_penality, svm_loss, svm_rngSeed, svm_retrainLimit, svm_evictThreshold): 217 | self.pos = np.zeros((0, dim), dtype=np.float32) 218 | self.neg = np.zeros((0, dim), dtype=np.float32) 219 | self.B = B 220 | self.C = C 221 | self.cls = cls 222 | self.pos_weight = pos_weight 223 | self.dim = dim 224 | self.feature_scale = feature_scale 225 | if type(pos_weight) == str: #e.g. pos_weight == 'auto' 226 | class_weight = pos_weight 227 | else: 228 | class_weight = {1: pos_weight, -1: 1} 229 | 230 | self.svm = svm.LinearSVC(C=C, class_weight=class_weight, 231 | intercept_scaling=B, verbose=1, 232 | penalty=svm_penality, loss=svm_loss, 233 | random_state=svm_rngSeed, dual=True) 234 | 235 | self.pos_cur = 0 236 | self.num_neg_added = 0 237 | self.retrain_limit = svm_retrainLimit 238 | self.evict_thresh = svm_evictThreshold 239 | self.loss_history = [] 240 | 241 | def alloc_pos(self, count): 242 | self.pos_cur = 0 243 | self.pos = np.zeros((count, self.dim), dtype=np.float32) 244 | 245 | def append_pos(self, feat): 246 | num = feat.shape[0] 247 | self.pos[self.pos_cur:self.pos_cur + num, :] = feat 248 | self.pos_cur += num 249 | 250 | def train(self): 251 | print('>>> Updating {} detector <<<'.format(self.cls)) 252 | num_pos = self.pos.shape[0] 253 | num_neg = self.neg.shape[0] 254 | print('Cache holds {} pos examples and {} neg examples'. 
255 | format(num_pos, num_neg)) 256 | X = np.vstack((self.pos, self.neg)) * self.feature_scale 257 | y = np.hstack((np.ones(num_pos), 258 | -np.ones(num_neg))) 259 | self.svm.fit(X, y) 260 | w = self.svm.coef_ 261 | b = self.svm.intercept_[0] 262 | 263 | scores = self.svm.decision_function(X) 264 | pos_scores = scores[:num_pos] 265 | neg_scores = scores[num_pos:] 266 | 267 | num_neg_wrong = sum(neg_scores > 0) 268 | num_pos_wrong = sum(pos_scores < 0) 269 | meanAcc = 0.5 * (num_pos - num_pos_wrong) / num_pos + 0.5*(num_neg - num_neg_wrong) / num_neg 270 | if type(self.pos_weight) == str: 271 | pos_loss = 0 272 | else: 273 | pos_loss = (self.C * self.pos_weight * 274 | np.maximum(0, 1 - pos_scores).sum()) 275 | neg_loss = self.C * np.maximum(0, 1 + neg_scores).sum() 276 | reg_loss = 0.5 * np.dot(w.ravel(), w.ravel()) + 0.5 * b ** 2 277 | tot_loss = pos_loss + neg_loss + reg_loss 278 | self.loss_history.append((meanAcc, num_pos_wrong, num_pos, num_neg_wrong, num_neg, tot_loss, pos_loss, neg_loss, reg_loss)) 279 | for i, losses in enumerate(self.loss_history): 280 | print((' {:4d}: meanAcc={:.3f} -- pos wrong: {:5}/{:5}; neg wrong: {:5}/{:5}; ' 281 | ' obj val: {:.3f} = {:.3f} (posUnscaled) + {:.3f} (neg) + {:.3f} (reg)').format(i, *losses)) 282 | 283 | # Sanity check 284 | 285 | scores_ret = ( 286 | X * 1.0 / self.feature_scale).dot(w.T * self.feature_scale) + b 287 | assert np.allclose(scores, scores_ret[:, 0], atol=1e-5), \ 288 | "Scores from returned model don't match decision function" 289 | 290 | return ((w * self.feature_scale, b), pos_scores, neg_scores) 291 | 292 | def append_neg_and_retrain(self, feat=None, force=False): 293 | if feat is not None: 294 | num = feat.shape[0] 295 | self.neg = np.vstack((self.neg, feat)) 296 | self.num_neg_added += num 297 | if self.num_neg_added > self.retrain_limit or force: 298 | self.num_neg_added = 0 299 | new_w_b, pos_scores, neg_scores = self.train() 300 | # scores = np.dot(self.neg, new_w_b[0].T) + new_w_b[1] 301 | # easy_inds = np.where(neg_scores < self.evict_thresh)[0] 302 | print(' Pruning easy negatives') 303 | print(' before pruning: #neg = ' + str(len(self.neg))) 304 | not_easy_inds = np.where(neg_scores >= self.evict_thresh)[0] 305 | if len(not_easy_inds) > 0: 306 | self.neg = self.neg[not_easy_inds, :] 307 | # self.neg = np.delete(self.neg, easy_inds) 308 | print(' after pruning: #neg = ' + str(len(self.neg))) 309 | print(' Cache holds {} pos examples and {} neg examples'. 
310 | format(self.pos.shape[0], self.neg.shape[0])) 311 | print(' {} pos support vectors'.format((pos_scores <= 1).sum())) 312 | print(' {} neg support vectors'.format((neg_scores >= -1).sum())) 313 | return new_w_b 314 | else: 315 | return None 316 | -------------------------------------------------------------------------------- /fastRCNN/utils34_win64/cython_bbox.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils34_win64/cython_bbox.pyd -------------------------------------------------------------------------------- /fastRCNN/utils34_win64/cython_nms.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils34_win64/cython_nms.pyd -------------------------------------------------------------------------------- /fastRCNN/utils35_win64/cython_bbox.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils35_win64/cython_bbox.pyd -------------------------------------------------------------------------------- /fastRCNN/utils35_win64/cython_nms.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/fastRCNN/utils35_win64/cython_nms.pyd -------------------------------------------------------------------------------- /fastRCNN/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | from __future__ import print_function 8 | import xml.etree.ElementTree as ET 9 | import os 10 | import pickle as cp 11 | import numpy as np 12 | 13 | def parse_rec(filename): 14 | """ Parse a PASCAL VOC xml file """ 15 | tree = ET.parse(filename) 16 | objects = [] 17 | for obj in tree.findall('object'): 18 | obj_struct = {} 19 | obj_struct['name'] = obj.find('name').text 20 | obj_struct['pose'] = obj.find('pose').text 21 | obj_struct['truncated'] = int(obj.find('truncated').text) 22 | obj_struct['difficult'] = int(obj.find('difficult').text) 23 | bbox = obj.find('bndbox') 24 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 25 | int(bbox.find('ymin').text), 26 | int(bbox.find('xmax').text), 27 | int(bbox.find('ymax').text)] 28 | objects.append(obj_struct) 29 | 30 | return objects 31 | 32 | def voc_ap(rec, prec, use_07_metric=False): 33 | """ ap = voc_ap(rec, prec, [use_07_metric]) 34 | Compute VOC AP given precision and recall. 35 | If use_07_metric is true, uses the 36 | VOC 07 11 point method (default:False). 37 | """ 38 | if use_07_metric: 39 | # 11 point metric 40 | ap = 0. 41 | for t in np.arange(0., 1.1, 0.1): 42 | if np.sum(rec >= t) == 0: 43 | p = 0 44 | else: 45 | p = np.max(prec[rec >= t]) 46 | ap = ap + p / 11. 
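For instance, under the 11-point metric just computed: a detector whose interpolated precision is 1.0 at the recall thresholds 0.0 through 0.5 and 0.5 at the thresholds 0.6 through 1.0 gets ap = (6 * 1.0 + 5 * 0.5) / 11, about 0.77 (illustrative numbers).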
47 | else: 48 | # correct AP calculation 49 | # first append sentinel values at the end 50 | mrec = np.concatenate(([0.], rec, [1.])) 51 | mpre = np.concatenate(([0.], prec, [0.])) 52 | 53 | # compute the precision envelope 54 | for i in range(mpre.size - 1, 0, -1): 55 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 56 | 57 | # to calculate area under PR curve, look for points 58 | # where X axis (recall) changes value 59 | i = np.where(mrec[1:] != mrec[:-1])[0] 60 | 61 | # and sum (\Delta recall) * prec 62 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 63 | return ap 64 | 65 | def voc_eval(detpath, 66 | annopath, 67 | imagesetfile, 68 | classname, 69 | cachedir, 70 | ovthresh=0.5, 71 | use_07_metric=False): 72 | """rec, prec, ap = voc_eval(detpath, 73 | annopath, 74 | imagesetfile, 75 | classname, 76 | [ovthresh], 77 | [use_07_metric]) 78 | 79 | Top level function that does the PASCAL VOC evaluation. 80 | 81 | detpath: Path to detections 82 | detpath.format(classname) should produce the detection results file. 83 | annopath: Path to annotations 84 | annopath.format(imagename) should be the xml annotations file. 85 | imagesetfile: Text file containing the list of images, one image per line. 86 | classname: Category name (duh) 87 | cachedir: Directory for caching the annotations 88 | [ovthresh]: Overlap threshold (default = 0.5) 89 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 90 | (default False) 91 | """ 92 | # assumes detections are in detpath.format(classname) 93 | # assumes annotations are in annopath.format(imagename) 94 | # assumes imagesetfile is a text file with each line an image name 95 | # cachedir caches the annotations in a pickle file 96 | 97 | # first load gt 98 | if cachedir: 99 | if not os.path.isdir(cachedir): 100 | os.mkdir(cachedir) 101 | cachefile = os.path.join(cachedir, 'annots.pkl') 102 | # read list of images 103 | with open(imagesetfile, 'r') as f: 104 | lines = f.readlines() 105 | imagenames = [x.strip() for x in lines] 106 | 107 | if not cachedir or not os.path.isfile(cachefile): 108 | # load annots 109 | recs = {} 110 | for i, imagename in enumerate(imagenames): 111 | recs[imagename] = parse_rec(annopath.format(imagename)) 112 | if i % 1000 == 0: 113 | print ('Reading annotation for {:d}/{:d}'.format( 114 | i + 1, len(imagenames))) 115 | # save 116 | if cachedir: 117 | print ('Saving cached annotations to {:s}'.format(cachefile)) 118 | with open(cachefile, 'wb') as f: 119 | cp.dump(recs, f) 120 | else: 121 | # load 122 | with open(cachefile, 'rb') as f: 123 | recs = cp.load(f) 124 | 125 | # extract gt objects for this class 126 | class_recs = {} 127 | npos = 0 128 | for imagename in imagenames: 129 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 130 | bbox = np.array([x['bbox'] for x in R]) 131 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 132 | det = [False] * len(R) 133 | npos = npos + sum(~difficult) 134 | class_recs[imagename] = {'bbox': bbox, 135 | 'difficult': difficult, 136 | 'det': det} 137 | 138 | # read dets 139 | detfile = detpath.format(classname) 140 | with open(detfile, 'r') as f: 141 | lines = f.readlines() 142 | 143 | splitlines = [x.strip().split(' ') for x in lines] 144 | image_ids = [x[0] for x in splitlines] 145 | confidence = np.array([float(x[1]) for x in splitlines]) 146 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 147 | 148 | # sort by confidence 149 | sorted_ind = np.argsort(-confidence) 150 | sorted_scores = np.sort(-confidence) 151 | 152 | BB = 
BB[sorted_ind, :] 153 | image_ids = [image_ids[x] for x in sorted_ind] 154 | 155 | # go down dets and mark TPs and FPs 156 | nd = len(image_ids) 157 | tp = np.zeros(nd) 158 | fp = np.zeros(nd) 159 | for d in range(nd): 160 | R = class_recs[image_ids[d]] 161 | bb = BB[d, :].astype(float) 162 | ovmax = -np.inf 163 | BBGT = R['bbox'].astype(float) 164 | 165 | if BBGT.size > 0: 166 | # compute overlaps 167 | # intersection 168 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 169 | iymin = np.maximum(BBGT[:, 1], bb[1]) 170 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 171 | iymax = np.minimum(BBGT[:, 3], bb[3]) 172 | iw = np.maximum(ixmax - ixmin + 1., 0.) 173 | ih = np.maximum(iymax - iymin + 1., 0.) 174 | inters = iw * ih 175 | 176 | # union 177 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 178 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 179 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 180 | 181 | overlaps = inters / uni 182 | ovmax = np.max(overlaps) 183 | jmax = np.argmax(overlaps) 184 | 185 | if ovmax > ovthresh: 186 | if not R['difficult'][jmax]: 187 | if not R['det'][jmax]: 188 | tp[d] = 1. 189 | R['det'][jmax] = 1 190 | else: 191 | fp[d] = 1. 192 | else: 193 | fp[d] = 1. 194 | 195 | # compute precision recall 196 | fp = np.cumsum(fp) 197 | tp = np.cumsum(tp) 198 | rec = tp / float(npos) 199 | # avoid divide by zero in case the first detection matches a difficult 200 | # ground truth 201 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 202 | ap = voc_ap(rec, prec, use_07_metric) 203 | 204 | return rec, prec, ap 205 | -------------------------------------------------------------------------------- /helpers_cntk.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from past.utils import old_div 4 | 5 | import os, pdb, sys, numpy as np 6 | from os.path import join 7 | from helpers import readTable 8 | 9 | from cntk import load_model, Trainer, UnitType, use_default_device, placeholder, constant, cross_entropy_with_softmax, classification_error 10 | from cntk.device import use_default_device #default #gpu, set_default_device 11 | from cntk.initializer import glorot_uniform 12 | from cntk.io import MinibatchSource, ImageDeserializer, CTFDeserializer, StreamDefs, StreamDef 13 | from cntk.io.transforms import scale 14 | from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule 15 | from cntk.logging import log_number_of_parameters, ProgressPrinter, TensorBoardProgressWriter 16 | from cntk.logging.graph import find_by_name, plot 17 | from cntk.ops import input_variable, parameter, times, combine, roipooling 18 | from cntk.ops.functions import CloneMethod 19 | 20 | 21 | #################################### 22 | # CNTK-python wrapper functions 23 | #################################### 24 | def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize): 25 | # set paths 26 | map_file = join(data_path, data_set + '.txt') 27 | roi_file = join(data_path, data_set + '.rois.txt') 28 | label_file = join(data_path, data_set + '.roilabels.txt') 29 | if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file): 30 | raise RuntimeError("File '%s', '%s' or '%s' does not exist. 
" % (map_file, roi_file, label_file)) 31 | 32 | # read images 33 | nrImages = len(readTable(map_file)) 34 | transforms = [scale(width=img_width, height=img_height, channels=3, 35 | scale_mode="pad", pad_value=114, interpolations='linear')] 36 | image_source = ImageDeserializer(map_file, StreamDefs(features = StreamDef(field='image', transforms=transforms))) 37 | 38 | # read rois and labels 39 | rois_dim = 4 * n_rois 40 | label_dim = n_classes * n_rois 41 | roi_source = CTFDeserializer(roi_file, StreamDefs( 42 | rois = StreamDef(field='rois', shape=rois_dim, is_sparse=False))) 43 | label_source = CTFDeserializer(label_file, StreamDefs( 44 | roiLabels = StreamDef(field='roiLabels', shape=label_dim, is_sparse=False))) 45 | 46 | # define a composite reader 47 | mb = MinibatchSource([image_source, roi_source, label_source], max_samples=sys.maxsize, randomize=randomize) 48 | return (mb, nrImages) 49 | 50 | 51 | # Defines the Fast R-CNN network model for detecting objects in images 52 | def frcn_predictor(features, rois, n_classes, base_path): 53 | # model specific variables for AlexNet 54 | model_file = base_path + "/../../../resources/cntk/AlexNet.model" 55 | roi_dim = 6 56 | feature_node_name = "features" 57 | last_conv_node_name = "conv5.y" 58 | pool_node_name = "pool3" 59 | last_hidden_node_name = "h2_d" 60 | 61 | # Load the pretrained classification net and find nodes 62 | print("Loading pre-trained model...") 63 | loaded_model = load_model(model_file) 64 | print("Loading pre-trained model... DONE.") 65 | feature_node = find_by_name(loaded_model, feature_node_name) 66 | conv_node = find_by_name(loaded_model, last_conv_node_name) 67 | pool_node = find_by_name(loaded_model, pool_node_name) 68 | last_node = find_by_name(loaded_model, last_hidden_node_name) 69 | 70 | # Clone the conv layers and the fully connected layers of the network 71 | conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: placeholder()}) 72 | fc_layers = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: placeholder()}) 73 | 74 | # Create the Fast R-CNN model 75 | feat_norm = features - constant(114) 76 | conv_out = conv_layers(feat_norm) 77 | roi_out = roipooling(conv_out, rois, (roi_dim, roi_dim)) 78 | fc_out = fc_layers(roi_out) 79 | #fc_out.set_name("fc_out") 80 | 81 | # z = Dense(rois[0], num_classes, map_rank=1)(fc_out) # --> map_rank=1 is not yet supported 82 | W = parameter(shape=(4096, n_classes), init=glorot_uniform()) 83 | b = parameter(shape=n_classes, init=0) 84 | z = times(fc_out, W) + b 85 | return z, fc_out 86 | 87 | 88 | # Initialize and train a Fast R-CNN model 89 | def init_train_fast_rcnn(image_height, image_width, num_classes, num_rois, mb_size, max_epochs, cntk_lr_per_image, l2_reg_weight, 90 | momentum_time_constant, base_path, boSkipTraining = False, debug_output=False, tensorboardLogDir = None): 91 | 92 | #make sure we use GPU for training 93 | if use_default_device().type() == 0: 94 | print("WARNING: using CPU for training.") 95 | else: 96 | print("Using GPU for training.") 97 | 98 | # Instantiate the Fast R-CNN prediction model 99 | image_input = input_variable((3, image_height, image_width)) 100 | roi_input = input_variable((num_rois, 4)) 101 | label_input = input_variable((num_rois, num_classes)) 102 | frcn_output, frcn_penultimateLayer = frcn_predictor(image_input, roi_input, num_classes, base_path) 103 | 104 | if boSkipTraining: 105 | print("Using pre-trained DNN without refinement") 106 | return frcn_penultimateLayer 107 | 108 | # Create the minibatch 
source and define mapping from reader streams to network inputs 109 | minibatch_source, epoch_size = create_mb_source("train", image_height, image_width, num_classes, num_rois, 110 | base_path, randomize=True) 111 | input_map = { 112 | image_input: minibatch_source.streams.features, 113 | roi_input: minibatch_source.streams.rois, 114 | label_input: minibatch_source.streams.roiLabels 115 | } 116 | 117 | # set loss / error functions 118 | ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1) 119 | pe = classification_error(frcn_output, label_input, axis=1) 120 | if debug_output: 121 | plot(frcn_output, "graph_frcn.png") 122 | 123 | # set the progress printer(s) 124 | progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)] 125 | if tensorboardLogDir != None: 126 | tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboardLogDir, model=frcn_output) 127 | progress_writers.append(tensorboard_writer) 128 | 129 | # Set learning parameters and instantiate the trainer object 130 | lr_per_sample = [f/float(num_rois) for f in cntk_lr_per_image] 131 | lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample) 132 | mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant) 133 | learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight) 134 | trainer = Trainer(frcn_output, (ce, pe), learner, progress_writers) 135 | 136 | # Get minibatches of images and perform model training 137 | print("Training Fast R-CNN model for %s epochs." % max_epochs) 138 | log_number_of_parameters(frcn_output) 139 | for epoch in range(max_epochs): 140 | sample_count = 0 141 | 142 | # loop over minibatches in the epoch 143 | while sample_count < epoch_size: 144 | data = minibatch_source.next_minibatch(min(mb_size, epoch_size - sample_count), input_map=input_map) 145 | if sample_count % 100 == 1: 146 | print("Training in progress: epoch {} of {}, sample count {} of {}".format(epoch, max_epochs, sample_count, epoch_size)) 147 | trainer.train_minibatch(data) 148 | sample_count += trainer.previous_minibatch_sample_count # count samples processed so far 149 | trainer.summarize_training_progress() 150 | 151 | # Log mean of each parameter tensor, so that we can confirm that the parameters change indeed. 
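# Each statistic below is written as a TensorBoard scalar once per epoch; a curve that
# stays flat across epochs indicates a parameter that is not being updated. To inspect
# the curves, point TensorBoard at the log directory passed to this function, e.g.:
#   tensorboard --logdir <tensorboardLogDir>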
152 | if tensorboardLogDir != None: 153 | for parameter in frcn_output.parameters: 154 | tensorboard_writer.write_value(parameter.uid + "/mean", np.mean(parameter.value), epoch) 155 | tensorboard_writer.write_value(parameter.uid + "/std", np.std(parameter.value), epoch) 156 | tensorboard_writer.write_value(parameter.uid + "/absSum", np.sum(np.abs(parameter.value)), epoch) 157 | 158 | if debug_output: 159 | frcn_output.save_model("frcn_py_%s.model" % (epoch + 1)) 160 | return frcn_output 161 | 162 | 163 | def run_fast_rcnn(model, data_set, image_height, image_width, num_classes, num_rois, base_path, outDir): 164 | # Create the minibatch source and define mapping from reader streams to network inputs 165 | minibatch_source, num_images = create_mb_source(data_set, image_height, image_width, num_classes, num_rois, base_path, randomize=False) 166 | input_map = { 167 | model.arguments[0]: minibatch_source['features'], 168 | model.arguments[1]: minibatch_source['rois'] 169 | } 170 | 171 | # evaluate test images and write to file 172 | for imgIndex in range(0, num_images): 173 | if imgIndex % 100 == 1: 174 | print("Evaluating images {} of {}".format(imgIndex, num_images)) 175 | data = minibatch_source.next_minibatch(1, input_map=input_map) 176 | output = model.eval(data)[0] 177 | output = np.array(output, np.float32) 178 | 179 | # write to disk 180 | if imgIndex % 100 == 1: 181 | print("Writing DNN output of dimension {} to disk".format(output.shape)) 182 | outPath = outDir + str(imgIndex) + ".dat" 183 | np.savez_compressed(outPath, output) -------------------------------------------------------------------------------- /imdb_data.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from __future__ import print_function 9 | from builtins import range 10 | import sys, os 11 | from helpers import * 12 | import scipy.sparse 13 | import scipy.io as sio 14 | import pickle as cp 15 | import numpy as np 16 | import fastRCNN 17 | 18 | 19 | class imdb_data(fastRCNN.imdb): 20 | def __init__(self, image_set, classes, maxNrRois, imgDir, roiDir, cacheDir, boAddGroundTruthRois): 21 | fastRCNN.imdb.__init__(self, image_set + ".cache") #'data_' + image_set) 22 | self._image_set = image_set 23 | self._maxNrRois = maxNrRois 24 | self._imgDir = imgDir 25 | self._roiDir = roiDir 26 | self._cacheDir = cacheDir #cache_path 27 | self._imgSubdirs ={'train': ['positive', 'negative'], 'test': ['testImages']} 28 | self._classes = classes 29 | self._class_to_ind = dict(zip(self.classes, range(self.num_classes))) 30 | self._image_ext = '.jpg' 31 | self._image_index, self._image_subdirs = self._load_image_set_index() 32 | self._roidb_handler = self.selective_search_roidb 33 | self._boAddGroundTruthRois = boAddGroundTruthRois 34 | 35 | 36 | #overwrite parent definition 37 | @property 38 | def cache_path(self): 39 | return self._cacheDir 40 | 41 | def image_path_at(self, i): 42 | """ 43 | Return the absolute path to image i in the image sequence. 44 | """ 45 | return self.image_path_from_index(self._image_subdirs[i], self._image_index[i]) 46 | 47 | def image_path_from_index(self, subdir, fname): 48 | """ 49 | Construct an image path from the image's "index" identifier. 
50 | """ 51 | image_path = os.path.join(self._imgDir, subdir, fname) 52 | assert os.path.exists(image_path), \ 53 | 'Path does not exist: {}'.format(image_path) 54 | return image_path 55 | 56 | def _load_image_set_index(self): 57 | """ 58 | Compile list of image indices and the subdirectories they are in. 59 | """ 60 | image_index = [] 61 | image_subdirs = [] 62 | for subdir in self._imgSubdirs[self._image_set]: 63 | imgFilenames = getFilesInDirectory(os.path.join(self._imgDir,subdir), self._image_ext) 64 | image_index += imgFilenames 65 | image_subdirs += [subdir] * len(imgFilenames) 66 | return image_index, image_subdirs 67 | 68 | def gt_roidb(self): 69 | """ 70 | Return the database of ground-truth regions of interest. 71 | 72 | This function loads/saves from/to a cache file to speed up future calls. 73 | """ 74 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 75 | if os.path.exists(cache_file): 76 | with open(cache_file, 'rb') as fid: 77 | roidb = cp.load(fid) 78 | print ('{} gt roidb loaded from {}'.format(self.name, cache_file)) 79 | return roidb 80 | 81 | gt_roidb = [self._load_annotation(i) for i in range(self.num_images)] 82 | with open(cache_file, 'wb') as fid: 83 | cp.dump(gt_roidb, fid, cp.HIGHEST_PROTOCOL) 84 | print ('wrote gt roidb to {}'.format(cache_file)) 85 | 86 | return gt_roidb 87 | 88 | def selective_search_roidb(self): 89 | """ 90 | Return the database of selective search regions of interest. 91 | Ground-truth ROIs are also included. 92 | 93 | This function loads/saves from/to a cache file to speed up future calls. 94 | """ 95 | cache_file = os.path.join(self.cache_path, 96 | self.name + '_selective_search_roidb.pkl') 97 | 98 | if os.path.exists(cache_file): 99 | with open(cache_file, 'rb') as fid: 100 | if sys.version_info[0] < 3: 101 | roidb = cp.load(fid) 102 | else: 103 | roidb = cp.load(fid, encoding='latin1') 104 | print ('{} ss roidb loaded from {}'.format(self.name, cache_file)) 105 | return roidb 106 | 107 | gt_roidb = self.gt_roidb() 108 | ss_roidb = self._load_selective_search_roidb(gt_roidb) 109 | 110 | #add ground truth ROIs 111 | if self._boAddGroundTruthRois: 112 | roidb = self.merge_roidbs(gt_roidb, ss_roidb) 113 | else: 114 | roidb = ss_roidb 115 | 116 | #Keep max of e.g. 2000 rois 117 | if self._maxNrRois and self._maxNrRois > 0: 118 | print ("Only keeping the first %d ROIs.." 
% self._maxNrRois)
119 | for i in range(self.num_images):
120 | gt_overlaps = roidb[i]['gt_overlaps']
121 | gt_overlaps = gt_overlaps.todense()[:self._maxNrRois]
122 | gt_overlaps = scipy.sparse.csr_matrix(gt_overlaps)
123 | roidb[i]['gt_overlaps'] = gt_overlaps
124 | roidb[i]['boxes'] = roidb[i]['boxes'][:self._maxNrRois,:]
125 | roidb[i]['gt_classes'] = roidb[i]['gt_classes'][:self._maxNrRois]
126 | 
127 | with open(cache_file, 'wb') as fid:
128 | cp.dump(roidb, fid, cp.HIGHEST_PROTOCOL)
129 | print ('wrote ss roidb to {}'.format(cache_file))
130 | 
131 | return roidb
132 | 
133 | def _load_selective_search_roidb(self, gt_roidb):
134 | # box_list = nrImages x nrBoxes x 4
135 | box_list = []
136 | for imgFilename, subdir in zip(self._image_index, self._image_subdirs):
137 | roiPath = "{}/{}/{}.roi.txt".format(self._roiDir, subdir, imgFilename[:-4])
138 | assert os.path.exists(roiPath), "Error: rois file not found: " + roiPath
139 | rois = np.loadtxt(roiPath, np.int32)
140 | box_list.append(rois)
141 | return self.create_roidb_from_box_list(box_list, gt_roidb)
142 | 
143 | def _load_annotation(self, imgIndex):
144 | """
145 | Load image and bounding boxes info from human annotations.
146 | """
147 | #negative images do not have any ground truth annotations
148 | if self._image_subdirs[imgIndex].lower() == "negative":
149 | return None
150 | 
151 | imgPath = self.image_path_at(imgIndex)
152 | bboxesPaths = imgPath[:-4] + ".bboxes.tsv"
153 | labelsPaths = imgPath[:-4] + ".bboxes.labels.tsv"
154 | assert os.path.exists(bboxesPaths), "Error: ground truth bounding boxes file not found: " + bboxesPaths
155 | assert os.path.exists(labelsPaths), "Error: ground truth labels file not found: " + labelsPaths
156 | bboxes = np.loadtxt(bboxesPaths, np.float32)
157 | labels = readFile(labelsPaths)
158 | 
159 | # if the file contains only a single annotation, np.loadtxt returns a 1D array;
160 | # reshape it to 2D so that downstream code can always index rows
161 | #if len(bboxes.shape) == 1:
162 | if len(bboxes) > 0 and type(bboxes[0]) == np.float32:
163 | bboxes = np.array([bboxes])
164 | 
165 | #remove boxes marked as 'undecided' or 'exclude'
166 | indicesToKeep = find(labels, lambda x: x!='EXCLUDE' and x!='UNDECIDED')
167 | bboxes = [bboxes[i] for i in indicesToKeep]
168 | labels = [labels[i] for i in indicesToKeep]
169 | 
170 | # Load object bounding boxes into a data frame.
171 | num_objs = len(bboxes)
172 | boxes = np.zeros((num_objs,4), dtype=np.uint16)
173 | gt_classes = np.zeros(num_objs, dtype=np.int32)
174 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
175 | for bboxIndex,(bbox,label) in enumerate(zip(bboxes,labels)):
176 | cls = self._class_to_ind[label] #.decode('utf-8')]
177 | boxes[bboxIndex, :] = bbox
178 | gt_classes[bboxIndex] = cls
179 | overlaps[bboxIndex, cls] = 1.0
180 | 
181 | overlaps = scipy.sparse.csr_matrix(overlaps)
182 | 
183 | return {'boxes' : boxes,
184 | 'gt_classes': gt_classes,
185 | 'gt_overlaps' : overlaps,
186 | 'flipped' : False}
187 | 
188 | # main call to compute per-class average precision
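# Illustrative access pattern (hypothetical indices): all_boxes[classIndex][imgIndex] is
# an (nDets x 5) array whose rows are [x1, y1, x2, y2, score] detections of that class in
# that image, so e.g. all_boxes[1][0][0, -1] is the confidence of the first detection of
# class 1 in image 0.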
189 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coords+score
190 | # (see also test_net() in fastRCNN\test.py)
191 | def evaluate_detections(self, all_boxes, output_dir, use_07_metric=False, overlapThreshold = 0.5):
192 | aps = []
193 | for classIndex, className in enumerate(self._classes):
194 | if className != '__background__':
195 | rec, prec, ap = self._evaluate_detections(classIndex, all_boxes, use_07_metric, overlapThreshold)
196 | aps += [[className,ap]]
197 | print('AP for {:>15} = {:.4f}'.format(className, ap))
198 | print('Mean AP = {:.4f}'.format(np.nanmean(getColumn(aps,1))))
199 | return aps
200 | 
201 | def _evaluate_detections(self, classIndex, all_boxes, use_07_metric = False, overlapThreshold = 0.5):
202 | """
203 | Top level function that does the PASCAL VOC evaluation.
204 | 
205 | [overlapThreshold]: Overlap threshold (default = 0.5)
206 | [use_07_metric]: Whether to use VOC07's 11 point AP computation (default False)
207 | """
208 | assert (len(all_boxes) == self.num_classes)
209 | assert (len(all_boxes[0]) == self.num_images)
210 | 
211 | # load ground truth annotations for this class
212 | gtInfos = []
213 | for imgIndex in range(self.num_images):
214 | imgPath = self.image_path_at(imgIndex)
215 | imgSubdir = os.path.normpath(imgPath).split(os.path.sep)[-2]
216 | if imgSubdir != 'negative':
217 | gtBoxes, gtLabels = readGtAnnotation(imgPath)
218 | gtBoxes = [box for box, label in zip(gtBoxes, gtLabels) if label == self.classes[classIndex]] #.decode('utf-8')
219 | else:
220 | gtBoxes = []
221 | gtInfos.append({'bbox': np.array(gtBoxes),
222 | 'difficult': [False] * len(gtBoxes),
223 | 'det': [False] * len(gtBoxes)})
224 | 
225 | # parse detections for this class
226 | # shape of all_boxes: e.g. 21 classes x 4952 images x 58 rois x 5 coords+score
227 | detBboxes = []
228 | detImgIndices = []
229 | detConfidences = []
230 | for imgIndex in range(self.num_images):
231 | dets = all_boxes[classIndex][imgIndex]
232 | if len(dets) > 0:  # dets is either an (nDets x 5) array or an empty list
233 | for k in range(dets.shape[0]):
234 | detImgIndices.append(imgIndex)
235 | detConfidences.append(dets[k, -1])
236 | # the VOCdevkit expects 1-based indices
237 | detBboxes.append([dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1])
238 | detBboxes = np.array(detBboxes)
239 | detConfidences = np.array(detConfidences)
240 | 
241 | # debug: visualize GT and detections
242 | # if classIndex == 15: # and imgPath.endswith("WIN_20160803_11_42_36_Pro.jpg"):
243 | # imgIndex = 6
244 | # imgPath = self.image_path_at(imgIndex)
245 | # img = imread(imgPath)
246 | # tmp_gtBoxes = gtInfos[imgIndex]['bbox']
247 | # inds = np.where(np.array(detImgIndices) == 1)[0]
248 | # tmp_detBoxes = detBboxes[inds]
249 | # print(detConfidences[inds])
250 | # drawRectangles(img, tmp_gtBoxes, color = (255, 0, 0)) #thickness=thickness)
251 | # drawRectangles(img, tmp_detBoxes, color= (0, 255, 0)) # thickness=thickness)
252 | # imshow(img, maxDim=800)
253 | 
254 | # compute precision / recall / ap
255 | rec, prec, ap = self._voc_computePrecisionRecallAp(
256 | class_recs=gtInfos,
257 | confidence=detConfidences,
258 | image_ids=detImgIndices,
259 | BB=detBboxes,
260 | ovthresh=overlapThreshold,
261 | use_07_metric=use_07_metric)
262 | 
263 | return rec, prec, ap
264 | 
265 | 
266 | ##########################################################################
267 | # Python evaluation functions (copied/refactored from faster-RCNN)
268 | ##########################################################################
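# Worked IoU example for the overlap test used below (hypothetical boxes): a detection
# bb = [0, 0, 9, 9] and a ground-truth box [5, 5, 14, 14] each cover 10 * 10 = 100 pixels
# under the +1 convention; the intersection is 5 * 5 = 25, so the overlap is
# 25 / (100 + 100 - 25) ~= 0.14, below the default ovthresh of 0.5, hence a false positive.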
269 | def _voc_computePrecisionRecallAp(self, class_recs, confidence, image_ids, BB, ovthresh=0.5, use_07_metric=False):
270 | # sort by confidence
271 | sorted_ind = np.argsort(-confidence)
272 | BB = BB[sorted_ind, :]
273 | image_ids = [image_ids[x] for x in sorted_ind]
274 | 
275 | # go down dets and mark TPs and FPs
276 | nd = len(image_ids)
277 | tp = np.zeros(nd)
278 | fp = np.zeros(nd)
279 | for d in range(nd):
280 | R = class_recs[image_ids[d]]
281 | bb = BB[d, :].astype(float)
282 | ovmax = -np.inf
283 | BBGT = R['bbox'].astype(float)
284 | 
285 | if BBGT.size > 0:
286 | # compute overlaps
287 | ixmin = np.maximum(BBGT[:, 0], bb[0])
288 | iymin = np.maximum(BBGT[:, 1], bb[1])
289 | ixmax = np.minimum(BBGT[:, 2], bb[2])
290 | iymax = np.minimum(BBGT[:, 3], bb[3])
291 | iw = np.maximum(ixmax - ixmin + 1., 0.)
292 | ih = np.maximum(iymax - iymin + 1., 0.)
293 | inters = iw * ih
294 | 
295 | # union
296 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
297 | (BBGT[:, 2] - BBGT[:, 0] + 1.) *
298 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
299 | 
300 | overlaps = inters / uni
301 | ovmax = np.max(overlaps)
302 | jmax = np.argmax(overlaps)
303 | 
304 | if ovmax > ovthresh:
305 | if not R['difficult'][jmax]:
306 | if not R['det'][jmax]:
307 | tp[d] = 1.
308 | R['det'][jmax] = 1
309 | else:
310 | fp[d] = 1.
311 | else:
312 | fp[d] = 1.
313 | 
314 | # compute precision recall
315 | npos = sum([len(cr['bbox']) for cr in class_recs])
316 | fp = np.cumsum(fp)
317 | tp = np.cumsum(tp)
318 | rec = tp / float(npos)
319 | # avoid divide by zero in case the first detection matches a difficult
320 | # ground truth
321 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
322 | ap = computeAveragePrecision(rec, prec, use_07_metric)
323 | return rec, prec, ap
--------------------------------------------------------------------------------
/resources/cntk/config.cntk:
--------------------------------------------------------------------------------
1 | # Fast-RCNN configuration for CNTK
2 | # For algorithm and details see http://arxiv.org/abs/1504.08083
3 | # Overview:
4 | # The Fast-RCNN algorithm uses a DNN that takes as inputs a set of images
5 | # and for each image a set of ROIs (Regions of interest). It first computes
6 | # a convolutional feature map for the entire image using a series
7 | # of convolutional layers (usually from a pretrained network). Then it
8 | # employs ROI pooling to crop out the part of the conv feature map
9 | # that corresponds to an ROI and resizes it to the input size expected
10 | # by the following layer (usually a set of pretrained fully connected layers).
11 | # Classification error and evaluation criterion are computed for each ROI.
12 | 
13 | #makeMode = false
14 | command = Train:WriteTest:WriteTrain
15 | 
16 | deviceId = "Auto"
17 | precision = "float"
18 | parallelTrain = "false"
19 | traceLevel = 1
20 | 
21 | rootDir = "."
22 | dataDir = "$rootDir$"
23 | outputDir = "$rootDir$/tmp"
24 | 
25 | modelPath = "$outputDir$/Fast-RCNN"
26 | #stderr = "$outputDir$/Fast-RCNN.log"
27 | 
28 | ImageH = 1000
29 | ImageW = 1000
30 | ImageC = 3
31 | 
32 | NumLabels = 22
33 | NumTrainROIs = 200
34 | NumTestROIs = 1000
35 | 
36 | TrainROIDim = 800 # $NumTrainROIs$ * 4
37 | TrainROILabelDim = 4400 # $NumTrainROIs$ * $NumLabels$
38 | TestROIDim = 4000 # $NumTestROIs$ * 4
39 | TestROILabelDim = 22000 # $NumTestROIs$ * $NumLabels$
40 | 
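# The four dimensions above are derived quantities and must be kept in sync by hand:
# e.g. TrainROIDim = NumTrainROIs * 4 = 200 * 4 = 800 and
# TrainROILabelDim = NumTrainROIs * NumLabels = 200 * 22 = 4400, and analogously
# TestROIDim = 1000 * 4 = 4000 and TestROILabelDim = 1000 * 22 = 22000.
41 | # For training we load a pretrained AlexNet model (AlexNet.model) and clone three parts of it.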
42 | # For the first part (up to pool1) we keep the weights fixed. The middle part contains the
43 | # remaining convolutional and pooling layers, and the last part contains the FC layers.
44 | # In the model we apply the first two cloned parts, then an ROI pooling layer and
45 | # finally the pretrained FC layers followed by a new FC layer that maps to the new
46 | # label dimensionality of NumLabels (= 22) classes.
47 | # The inputs are images (1000 x 1000 x 3), ROIs (NumTrainROIs x 4 coordinates (x, y, w, h))
48 | # and ground truth labels per ROI (NumTrainROIs x NumLabels classes).
49 | Train = {
50 | action = "train"
51 | 
52 | BrainScriptNetworkBuilder = {
53 | imageShape = $ImageH$:$ImageW$:$ImageC$ # 1000:1000:3
54 | labelShape = $NumLabels$:$NumTrainROIs$ # 22:200
55 | ROIShape = 4:$NumTrainROIs$ # 4:200
56 | 
57 | network = BS.Network.Load ("../../../resources/cntk/AlexNet.model")
58 | convLayers = BS.Network.CloneFunction(network.features, network.conv5_y, parameters = "constant")
59 | fcLayers = BS.Network.CloneFunction(network.pool3, network.h2_d)
60 | 
61 | 
62 | model (features, rois) = {
63 | featNorm = features - 114
64 | convOut = convLayers (featNorm)
65 | roiOut = ROIPooling (convOut, rois, (6:6))
66 | fcOut = fcLayers (roiOut)
67 | W = ParameterTensor{($NumLabels$:4096), init="glorotUniform"}
68 | b = ParameterTensor{$NumLabels$, init = 'zero'}
69 | z = W * fcOut + b
70 | }.z
71 | 
72 | features = Input {imageShape}
73 | roiLabels = Input {labelShape}
74 | rois = Input {ROIShape}
75 | 
76 | z = model (features, rois)
77 | 
78 | ce = CrossEntropyWithSoftmax(roiLabels, z, axis = 1)
79 | errs = ClassificationError(roiLabels, z, axis = 1)
80 | 
81 | featureNodes = (features:rois)
82 | labelNodes = (roiLabels)
83 | criterionNodes = (ce)
84 | evaluationNodes = (errs)
85 | outputNodes = (z)
86 | }
87 | 
88 | SGD = {
89 | epochSize = 0
90 | minibatchSize = 1
91 | maxEpochs = 17
92 | 
93 | #learningRatesPerSample = 0.00001
94 | #momentumAsTimeConstant = 0*5:10
95 | #dropoutRate = 0
96 | 
97 | learningRatesPerMB=0.00001*10:0.000001*5:0.0000001
98 | momentumPerMB=0.9
99 | gradUpdateType=None
100 | L2RegWeight=0.0005
101 | dropoutRate=0.5 #0*5:0.5
102 | 
103 | numMBsToShowResult = 50
104 | }
105 | 
106 | reader = {
107 | randomize = true
108 | verbosity = 2
109 | deserializers = ({
110 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
111 | file = $dataDir$/train.rois.txt
112 | input = { rois = { dim = $TrainROIDim$ ; format = "dense" } }
113 | }:{
114 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
115 | file = $dataDir$/train.roilabels.txt
116 | input = { roiLabels = { dim = $TrainROILabelDim$ ; format = "dense" } }
117 | }:{
118 | type = "ImageDeserializer" ; module = "ImageReader"
119 | file = $dataDir$/train.txt
120 | input = {
121 | features = { transforms = (
122 | { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
123 | { type = "Transpose" }
124 | )}
125 | ignored = {labelDim = 1000}
126 | }
127 | })
128 | }
129 | }
130 | 
131 | # Write network output for entire test data set
132 | WriteTest = {
133 | action = "write"
134 | minibatchSize = 1
135 | 
136 | # outputPath = "$OutputDir$/fastrcnnNetOutput"
137 | outputPath=test
138 | 
139 | reader = {
140 | randomize = false
141 | verbosity = 2
142 | deserializers = ({
143 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
144 | file = $dataDir$/test.rois.txt
145 | input = { rois = { dim = $TestROIDim$ ; format = "dense" } }
146 | }:{
147 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
148 | file = $dataDir$/test.roilabels.txt
149 | input = { roiLabels = { dim = $TestROILabelDim$ ; format = "dense" } }
150 | }:{
151 | type = "ImageDeserializer" ; module = "ImageReader"
152 | file = $dataDir$/test.txt
153 | input = {
154 | features = { transforms = (
155 | { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
156 | { type = "Transpose" }
157 | )}
158 | ignored = {labelDim = 1000}
159 | }
160 | })
161 | }
162 | }
163 | 
164 | # Write network output for entire train data set
165 | WriteTrain = {
166 | action = "write"
167 | minibatchSize = 1
168 | 
169 | # outputPath = "$OutputDir$/fastrcnnNetOutput"
170 | outputPath=train
171 | 
172 | reader = {
173 | randomize = false
174 | verbosity = 2
175 | deserializers = ({
176 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
177 | file = $dataDir$/train.rois.txt
178 | input = { rois = { dim = $TestROIDim$ ; format = "dense" } }
179 | }:{
180 | type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
181 | file = $dataDir$/train.roilabels.txt
182 | input = { roiLabels = { dim = $TestROILabelDim$ ; format = "dense" } }
183 | }:{
184 | type = "ImageDeserializer" ; module = "ImageReader"
185 | file = $dataDir$/train.txt
186 | input = {
187 | features = { transforms = (
188 | { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
189 | { type = "Transpose" }
190 | )}
191 | ignored = {labelDim = 1000}
192 | }
193 | })
194 | }
195 | }
196 | 
197 | 
--------------------------------------------------------------------------------
/resources/cntk/model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/resources/cntk/model.pdf
--------------------------------------------------------------------------------
/resources/python35_64bit_requirements/opencv_python-3.2.0-cp35-cp35m-win_amd64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/ObjectDetectionUsingCntk/9114d38f2fd7370ebb4c33268afa0d908f0116f6/resources/python35_64bit_requirements/opencv_python-3.2.0-cp35-cp35m-win_amd64.whl
--------------------------------------------------------------------------------
/resources/python35_64bit_requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | ./opencv_python-3.2.0-cp35-cp35m-win_amd64.whl
2 | scikit-learn
3 | Pillow
4 | future
5 | dlib
6 | EasyDict
--------------------------------------------------------------------------------
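Installation note (a sketch, not part of the original repository): requirements.txt references the bundled OpenCV wheel by a relative path, so it is assumed that pip is invoked from inside the requirements directory:

cd resources/python35_64bit_requirements
pip install -r requirements.txt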