├── .gitignore ├── README.md ├── TrainClassif.ipynb ├── convertJupyter.sh ├── create_mean_std_file.py ├── dataset ├── ReadImages.ipynb ├── ReadImages.py ├── __init__.py ├── collection.ipynb └── collection.py ├── datasetTools.ipynb ├── make_plots.ipynb ├── mean_std.ipynb ├── model ├── ModelDefinition.ipynb ├── ModelDefinition.py ├── RNN.ipynb ├── RNN.py ├── Untitled.ipynb ├── __init__.py ├── cours2.ipynb ├── custom_modules.py ├── nn_utils.py ├── siamese.ipynb └── siamese.py ├── pre_proc.ipynb ├── pre_process_dataset.py ├── test ├── __init__.py ├── classif_finetune_test.py ├── classif_regions_test.py ├── instance_avg.py ├── siamese_descriptor_test.py └── siamese_regions_test.py ├── train ├── __init__.py ├── classif_finetune.py ├── classif_finetune_p.py ├── classif_regions.py ├── classif_regions_p.py ├── global_p.py ├── siamese_descriptor.py ├── siamese_descriptor_p.py ├── siamese_regions.py └── siamese_regions_p.py ├── utils.py ├── utils ├── __init__.py ├── dataset.py ├── general.py ├── image.py ├── metrics.py ├── train_classif.py ├── train_general.py └── train_siamese.py └── visualize_cnn.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Python (from Github Python gitignore) 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | env/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | *.manifest 24 | *.spec 25 | pip-log.txt 26 | pip-delete-this-directory.txt 27 | .ipynb_checkpoints 28 | .python-version 29 | .env 30 | .venv 31 | venv/ 32 | ENV/ 33 | 34 | # custom 35 | data/ 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch models definition and test for image retrieval 2 | 3 | This is the implementation of the paper : 4 | 5 | Portaz, M., Kohl, M., Chevallet, J. P., Quénot, G., & Mulhem, P. (2019). Object instance identification with fully convolutional networks. Multimedia Tools and Applications, 78(3), 2747-2764. 
6 | 7 | If you use it, please cite : 8 | 9 | @article{portaz2019object, 10 | title={Object instance identification with fully convolutional networks}, 11 | author={Portaz, Maxime and Kohl, Matthias and Chevallet, Jean-Pierre and Qu{\'e}not, Georges and Mulhem, Philippe}, 12 | journal={Multimedia Tools and Applications}, 13 | volume={78}, 14 | number={3}, 15 | pages={2747--2764}, 16 | year={2019}, 17 | publisher={Springer} 18 | } 19 | 20 | ## Test several approaches for images retrieval: 21 | * Feature Extraction from Pretrained CNN 22 | * Pretrained CNN finetuning 23 | * Siamese network from scratch 24 | * Siamese network with pretrained network 25 | 26 | # TrainClassif 27 | Finetune a CNN for classification over few examples 28 | Finetune only the classifier or the entire network 29 | 30 | # TrainSiamese 31 | Train a siamese network with pairs selection for image retrieval 32 | -------------------------------------------------------------------------------- /convertJupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | jupyter nbconvert --to python model/ModelDefinition.ipynb 3 | jupyter nbconvert --to python dataset/collection.ipynb 4 | jupyter nbconvert --to python dataset/ReadImages.ipynb 5 | jupyter nbconvert --to python TrainClassif.ipynb 6 | jupyter nbconvert --to python model/siamese.ipynb 7 | jupyter nbconvert --to python trainSiamese.ipynb 8 | jupyter nbconvert --to python model/RNN.ipynb 9 | -------------------------------------------------------------------------------- /create_mean_std_file.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import torchvision.transforms as transforms 5 | import numpy as np 6 | from utils import get_images_labels, imread_rgb 7 | from utils import match_label_fou_clean2, match_label_video 8 | 9 | # create the mean std file needed to normalize images of a dataset 10 | 11 | # path to the training images of the dataset 12 | dataset_path = 'data/pre_proc/CLICIDE_video_224sq' 13 | # file to write the mean and std values to 14 | out_path = 'data/CLICIDE_224sq_train_ms.txt' 15 | # function to match labels, this is not necessary here 16 | match_labels = match_label_video 17 | # if the image size is constant, indicate it in format (C, H, W) 18 | # if the image size is not constant, use None here 19 | image_size = (3, 224, 224) 20 | dataset_full = get_images_labels(dataset_path, match_labels) 21 | 22 | mean = [0., 0., 0.] 23 | std = [0., 0., 0.] 24 | size = len(dataset_full) 25 | if image_size is not None: 26 | T = torch.Tensor(size, *(image_size)) 27 | for i, (im, _) in enumerate(dataset_full): 28 | T[i] = transforms.ToTensor()(imread_rgb(im)) 29 | for i in range(3): 30 | mean[i] = T[:, i, :, :].mean() 31 | std[i] = T[:, i, :, :].std() 32 | else: 33 | # cannot take mean/std of whole dataset tensor. 34 | # need to compute mean of all pixels and std afterwards, pixel by pixel 35 | dataset_open = [] 36 | for im, _ in dataset_full: 37 | im_o = imread_rgb(im) / 255. 
# cv2 images are 0-255, torch tensors are 0-1 38 | im_size = im_o.shape[0] * im_o.shape[1] 39 | dataset_open.append((im_o, im_size)) 40 | for i in range(3): 41 | mean[i] += np.sum(im_o[:, :, i]) / (im_size * size) 42 | for im_o, im_size in dataset_open: 43 | for i in range(3): 44 | std[i] += np.sum(np.square(im_o[:, :, i] - mean[i])) / (im_size * size) 45 | for i in range(3): 46 | std[i] = np.sqrt(std[i]) 47 | 48 | with open(out_path, 'w') as outfile: 49 | outfile.write(' '.join(map(repr, mean))) 50 | outfile.write('\n') 51 | outfile.write(' '.join(map(repr, std))) 52 | outfile.write('\n') 53 | -------------------------------------------------------------------------------- /dataset/ReadImages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from __future__ import division\n", 14 | "import glob\n", 15 | "import os.path as path\n", 16 | "from PIL import Image\n", 17 | "import torchvision.transforms as transforms" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true, 25 | "deletable": true, 26 | "editable": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "def readImagesInCLass(folder='.'):\n", 31 | " \"\"\"\n", 32 | " Read a folder containing images with the structure :\n", 33 | " folder\n", 34 | " --class1\n", 35 | " --image1\n", 36 | " --image2\n", 37 | " --class2\n", 38 | " --image3\n", 39 | " --image3\n", 40 | " \n", 41 | " Return :\n", 42 | " list of couple : (image, class)\n", 43 | " \"\"\"\n", 44 | " \n", 45 | " exts = ('*.jpg', '*.JPG', '*.JPEG', \"*.png\")\n", 46 | " r = []\n", 47 | " for el in glob.iglob(path.join(folder, '*')):\n", 48 | " if path.isdir(el):\n", 49 | " for ext in exts:\n", 50 | " r.extend( [(im, el.split('/')[-1]) for im in glob.iglob(path.join(el, ext)) ] )\n", 51 | " return r" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": true, 59 | "deletable": true, 60 | "editable": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "def readImageswithPattern(folder='.', matchFunc=lambda x:x.split('.')[0]):\n", 65 | " \"\"\"\n", 66 | " Read a folder containing images where the name of the class is in the filename\n", 67 | " the match function should return the class given the filename\n", 68 | " Return :\n", 69 | " list of couple : (image, class)\n", 70 | " \"\"\"\n", 71 | " exts = ('*.jpg', '*.JPG', '*.JPEG', \"*.png\")\n", 72 | " r = []\n", 73 | " for ext in exts:\n", 74 | " r.extend( [(im, matchFunc(im)) for im in glob.iglob(path.join(folder, ext)) ] )\n", 75 | " return r" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": { 82 | "collapsed": false, 83 | "deletable": true, 84 | "editable": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def openAll(imageList, size=0 ):\n", 89 | " \"\"\"\n", 90 | " Open all images, return a list of PIL images\n", 91 | " \"\"\"\n", 92 | " if size == 0:\n", 93 | " return [Image.open(im) for im, c in imageList]\n", 94 | " else:\n", 95 | " return [Image.open(im).resize(size) for im, c in imageList]\n", 96 | " " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "def openDict(imageList, 
size=(225,225)):\n", 108 | " \"\"\"\n", 109 | " Open all images, return a dictionnary of (image name : PIL image) and resize as the given size\n", 110 | " \"\"\"\n", 111 | " return {im: Image.open(im).resize(size) for im, c in imageList}" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": { 118 | "collapsed": true, 119 | "deletable": true, 120 | "editable": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "def positiveCouples(dataset):\n", 125 | " \"\"\"\n", 126 | " Create all positive couples in the dataset\n", 127 | " \"\"\"\n", 128 | " return [ (im[0], im2[0], 1) for im in dataset for im2 in dataset if im[1]==im2[1]]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def negativeCouples(dataset):\n", 140 | " \"\"\"\n", 141 | " Create all negative couples in the dataset\n", 142 | " \"\"\"\n", 143 | " return [ (im[0], im2[0], -1) for im in dataset for im2 in dataset if im[1] != im2[1]]" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 9, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "def createCouples(dataset):\n", 155 | " \"\"\"\n", 156 | " Create all couples in the dataset\n", 157 | " \"\"\"\n", 158 | " return [ (im[0], im2[0], 1) if im[1] == im2[1] else (im[0], im2[0], -1) for im in dataset for im2 in dataset]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "27271\n", 173 | "10502754\n", 174 | "Nb of p / nb of n : 0.260 %\n", 175 | "10530025\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "if __name__ == '__main__':\n", 181 | " dataset = readImageswithPattern('/video/CLICIDE', lambda x:x.split('/')[-1].split('-')[0]) #read Clicide dataset\n", 182 | " p = positiveCouples(dataset) #Clicide positives couples\n", 183 | " print(len(p)) #should be 27217\n", 184 | " n = negativeCouples(dataset) #Clicide negatives couples, all of them\n", 185 | " print(len(n)) #should be 10502754 (10M)\n", 186 | " print(\"Nb of p / nb of n : %.3f %%\" % (len(p)/len(n)*100)) #around 0.2% of positive examples\n", 187 | " a = createCouples(dataset)\n", 188 | " print(len(a))\n", 189 | " " 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 2", 196 | "language": "python", 197 | "name": "python2" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 2 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython2", 209 | "version": "2.7.9" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /dataset/ReadImages.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from __future__ import division 7 | import glob 8 | import os.path as path 9 | from PIL import Image 10 | import torchvision.transforms as transforms 11 | 12 | 13 | # In[2]: 14 | 15 | def readImagesInCLass(folder='.'): 16 | """ 17 | Read a folder containing images with the structure : 18 | folder 19 | --class1 20 | 
--image1 21 | --image2 22 | --class2 23 | --image3 24 | --image3 25 | 26 | Return : 27 | list of couple : (image, class) 28 | """ 29 | 30 | exts = ('*.jpg', '*.JPG', '*.JPEG', "*.png") 31 | r = [] 32 | for el in glob.iglob(path.join(folder, '*')): 33 | if path.isdir(el): 34 | for ext in exts: 35 | r.extend( [(im, el.split('/')[-1]) for im in glob.iglob(path.join(el, ext)) ] ) 36 | return r 37 | 38 | 39 | # In[3]: 40 | 41 | def readImageswithPattern(folder='.', matchFunc=lambda x:x.split('.')[0]): 42 | """ 43 | Read a folder containing images where the name of the class is in the filename 44 | the match function should return the class given the filename 45 | Return : 46 | list of couple : (image, class) 47 | """ 48 | exts = ('*.jpg', '*.JPG', '*.JPEG', "*.png") 49 | r = [] 50 | for ext in exts: 51 | r.extend( [(im, matchFunc(im)) for im in glob.iglob(path.join(folder, ext)) ] ) 52 | return r 53 | 54 | 55 | # In[4]: 56 | 57 | def openAll(imageList, size=0 ): 58 | """ 59 | Open all images, return a list of PIL images 60 | """ 61 | if size == 0: 62 | return [Image.open(im) for im, c in imageList] 63 | else: 64 | return [Image.open(im).resize(size) for im, c in imageList] 65 | 66 | 67 | 68 | # In[ ]: 69 | 70 | def openDict(imageList, size=(225,225)): 71 | """ 72 | Open all images, return a dictionnary of (image name : PIL image) and resize as the given size 73 | """ 74 | return {im: Image.open(im).resize(size) for im, c in imageList} 75 | 76 | 77 | # In[5]: 78 | 79 | def positiveCouples(dataset): 80 | """ 81 | Create all positive couples in the dataset 82 | """ 83 | return [ (im[0], im2[0], 1) for im in dataset for im2 in dataset if im[1]==im2[1]] 84 | 85 | 86 | # In[6]: 87 | 88 | def negativeCouples(dataset): 89 | """ 90 | Create all negative couples in the dataset 91 | """ 92 | return [ (im[0], im2[0], -1) for im in dataset for im2 in dataset if im[1] != im2[1]] 93 | 94 | 95 | # In[9]: 96 | 97 | def createCouples(dataset): 98 | """ 99 | Create all couples in the dataset 100 | """ 101 | return [ (im[0], im2[0], 1) if im[1] == im2[1] else (im[0], im2[0], -1) for im in dataset for im2 in dataset] 102 | 103 | 104 | # In[10]: 105 | 106 | if __name__ == '__main__': 107 | dataset = readImageswithPattern('/video/CLICIDE', lambda x:x.split('/')[-1].split('-')[0]) #read Clicide dataset 108 | p = positiveCouples(dataset) #Clicide positives couples 109 | print(len(p)) #should be 27217 110 | n = negativeCouples(dataset) #Clicide negatives couples, all of them 111 | print(len(n)) #should be 10502754 (10M) 112 | print("Nb of p / nb of n : %.3f %%" % (len(p)/len(n)*100)) #around 0.2% of positive examples 113 | a = createCouples(dataset) 114 | print(len(a)) 115 | 116 | 117 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxgreat/Instance-Search/2cea5f64a2d397047072a91788af81c0ea1c6d5e/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "import torch.utils.data\n", 13 | "import torchvision.transforms as transforms" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 
null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def ComputeMean(imagesList, h=299, w=299):\n", 25 | " \"\"\"\n", 26 | " TODO : make efficient\n", 27 | " Return the mean of the collection for each chanel RGB\n", 28 | " \"\"\"\n", 29 | " r,g,b = 0,0,0\n", 30 | " toT = transforms.ToTensor()\n", 31 | "\n", 32 | " #f = FloatProgress(min=0, max=len(imagesList))\n", 33 | " #display(f)\n", 34 | "\n", 35 | " for im in imagesList:\n", 36 | " #f.value += 1\n", 37 | " t = toT(im)\n", 38 | " for e in t[0].view(-1):\n", 39 | " r += e\n", 40 | " for e in t[1].view(-1):\n", 41 | " g += e\n", 42 | " for e in t[2].view(-1):\n", 43 | " b += e\n", 44 | " return r/(len(imagesList)*h*w), g/(len(imagesList)*h*w), b/(len(imagesList)*h*w) " 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "def ComputeStdDev(imagesList, mean):\n", 56 | " \"\"\"\n", 57 | " TODO : make efficient\n", 58 | " Return the std deviation for each channel over the collection\n", 59 | " \"\"\"\n", 60 | " toT = transforms.ToTensor()\n", 61 | " r,g,b = 0,0,0\n", 62 | " h = len(toT(imagesList[0])[0])\n", 63 | " w = len(toT(imagesList[0])[0][0])\n", 64 | " for im in imagesList:\n", 65 | " t = toT(im)\n", 66 | " for e in t[0].view(-1):\n", 67 | " r += (e - mean[0])**2\n", 68 | " for e in t[1].view(-1):\n", 69 | " g += (e - mean[1])**2\n", 70 | " for e in t[2].view(-1):\n", 71 | " b += (e - mean[2])**2\n", 72 | " return (r/(len(imagesList)*h*w))**0.5, (g/(len(imagesList)*h*w))**0.5, (b/(len(imagesList)*h*w))**0.5" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "def createConceptDict(imageList):\n", 84 | " \"\"\"\n", 85 | " Create a dictionnary that store for each concept the list of image path corresponding\n", 86 | " \"\"\"\n", 87 | " ConceptDict = {}\n", 88 | " for im in imageList:\n", 89 | " if im[1] in ConceptDict.keys():\n", 90 | " ConceptDict[im[1]].append(im[0])\n", 91 | " else:\n", 92 | " ConceptDict[im[1]] = [im[0]]\n", 93 | " return ConceptDict" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 2", 100 | "language": "python", 101 | "name": "python2" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 2 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython2", 113 | "version": "2.7.9" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /dataset/collection.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import tensorflow as tf 7 | import torch.utils.data 8 | import torchvision.transforms as transforms 9 | 10 | 11 | # In[ ]: 12 | 13 | def ComputeMean(imagesList, h=299, w=299): 14 | """ 15 | TODO : make efficient 16 | Return the mean of the collection for each chanel RGB 17 | """ 18 | r,g,b = 0,0,0 19 | toT = transforms.ToTensor() 20 | 21 | #f = FloatProgress(min=0, max=len(imagesList)) 22 | #display(f) 23 | 24 | for im in imagesList: 25 | #f.value += 1 26 | t = toT(im) 27 | for e in t[0].view(-1): 28 | r += e 29 | for e in t[1].view(-1): 30 | g += e 31 | for e in 
t[2].view(-1): 32 | b += e 33 | return r/(len(imagesList)*h*w), g/(len(imagesList)*h*w), b/(len(imagesList)*h*w) 34 | 35 | 36 | # In[ ]: 37 | 38 | def ComputeStdDev(imagesList, mean): 39 | """ 40 | TODO : make efficient 41 | Return the std deviation for each channel over the collection 42 | """ 43 | toT = transforms.ToTensor() 44 | r,g,b = 0,0,0 45 | h = len(toT(imagesList[0])[0]) 46 | w = len(toT(imagesList[0])[0][0]) 47 | for im in imagesList: 48 | t = toT(im) 49 | for e in t[0].view(-1): 50 | r += (e - mean[0])**2 51 | for e in t[1].view(-1): 52 | g += (e - mean[1])**2 53 | for e in t[2].view(-1): 54 | b += (e - mean[2])**2 55 | return (r/(len(imagesList)*h*w))**0.5, (g/(len(imagesList)*h*w))**0.5, (b/(len(imagesList)*h*w))**0.5 56 | 57 | 58 | # In[2]: 59 | 60 | def createConceptDict(imageList): 61 | """ 62 | Create a dictionnary that store for each concept the list of image path corresponding 63 | """ 64 | ConceptDict = {} 65 | for im in imageList: 66 | if im[1] in ConceptDict.keys(): 67 | ConceptDict[im[1]].append(im[0]) 68 | else: 69 | ConceptDict[im[1]] = [im[0]] 70 | return ConceptDict 71 | 72 | -------------------------------------------------------------------------------- /datasetTools.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\"\"\"\n", 12 | "import glob\n", 13 | "with open('CliListTest.txt', \"w\") as f:\n", 14 | " l = glob.glob('/home/data/collection/GUIMUTEIC/CLICIDE/CLICIDEMAX/test/*.JPG')\n", 15 | " print(len(l))\n", 16 | " for a in l:\n", 17 | " if not 'wall' in a:\n", 18 | " f.write(a+\"\\n\")\n", 19 | "\"\"\"\n", 20 | "with open('FouList.txt', \"r\") as f:\n", 21 | " with open('FouConcept.txt' ,\"w\") as fout:\n", 22 | " a = set()\n", 23 | " for l in f:\n", 24 | " floor, nb, _ = l.split('/')[-1].split('_')\n", 25 | " a.add(floor+'_'+nb)\n", 26 | " for e in a:\n", 27 | " fout.write(e+'\\n')\n" 28 | ] 29 | } 30 | ], 31 | "metadata": { 32 | "kernelspec": { 33 | "display_name": "Python 2", 34 | "language": "python", 35 | "name": "python2" 36 | }, 37 | "language_info": { 38 | "codemirror_mode": { 39 | "name": "ipython", 40 | "version": 2 41 | }, 42 | "file_extension": ".py", 43 | "mimetype": "text/x-python", 44 | "name": "python", 45 | "nbconvert_exporter": "python", 46 | "pygments_lexer": "ipython2", 47 | "version": "2.7.9" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /mean_std.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import glob\n", 14 | "from os import path\n", 15 | "import torch\n", 16 | "import torchvision.transforms as transforms\n", 17 | "from PIL import Image\n", 18 | "from dataset.ReadImages import readImageswithPattern" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true, 26 | "deletable": true, 27 | "editable": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "dataset_path = '/home/mrim/kohlm/nnForRetrieval/data/pre_proc/fourviere_227sq'\n", 32 | "dataset_test = 
'/home/mrim/kohlm/nnForRetrieval/data/pre_proc/fourviere_227sq/test'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": { 39 | "collapsed": false, 40 | "deletable": true, 41 | "editable": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "def match(x):\n", 46 | " return x.split('/')[-1].split('-')[0]\n", 47 | "\n", 48 | "trainSetFull = readImageswithPattern(dataset_path, match)\n", 49 | "testSetFull = readImageswithPattern(dataset_test, match)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "metadata": { 56 | "collapsed": false, 57 | "deletable": true, 58 | "editable": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "labels = list(set([t[1] for t in trainSetFull]))\n", 63 | "size = sum(1 for _, lab in trainSetFull if lab in labels)\n", 64 | "T = torch.Tensor(size, 3, 227, 227)\n", 65 | "i = 0\n", 66 | "for img, lab in trainSetFull:\n", 67 | " if lab in labels:\n", 68 | " im = Image.open(img)\n", 69 | " T[i] = transforms.ToTensor()(im)\n", 70 | " i += 1" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": { 77 | "collapsed": false, 78 | "deletable": true, 79 | "editable": true 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "[0.3643357148734363, 0.304306334270731, 0.2774018310814609]\n", 87 | "[0.21223013632973034, 0.2003156783406293, 0.19758758196073448]\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "mean = [0, 0, 0]\n", 93 | "std = [0, 0, 0]\n", 94 | "for i in range(3):\n", 95 | " mean[i] = T[:, i, :, :].mean()\n", 96 | " std[i] = T[:, i, :, :].std()\n", 97 | "print(mean)\n", 98 | "print(std)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": true, 106 | "deletable": true, 107 | "editable": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /model/ModelDefinition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import torch\n", 12 | "import torch.nn as nn\n", 13 | "import torch.nn.parallel" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "class maxnet(nn.Module):\n", 25 | " def __init__(self, nbClass=464):\n", 26 | " super(maxnet, self).__init__()\n", 27 | " self.features = nn.Sequential(\n", 28 | " nn.Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2)),\n", 29 | " nn.ReLU(True),\n", 30 | " nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1)),\n", 31 | " nn.Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)),\n", 32 | " nn.ReLU(True),\n", 33 | " nn.MaxPool2d((3, 3), stride=(2, 2), 
dilation=(1, 1)),\n", 34 | " nn.Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),\n", 35 | " nn.ReLU(True),\n", 36 | " nn.Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),\n", 37 | " nn.ReLU(True),\n", 38 | " nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),\n", 39 | " nn.ReLU(True),\n", 40 | " nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1))\n", 41 | " )\n", 42 | " self.classifier = nn.Sequential(\n", 43 | " nn.Dropout(),\n", 44 | " nn.Linear(256 * 6 * 6, 4096),\n", 45 | " nn.ReLU(inplace=True),\n", 46 | " nn.Dropout(),\n", 47 | " nn.Linear(4096, 4096),\n", 48 | " nn.ReLU(inplace=True),\n", 49 | " nn.Linear(4096, nbClass),\n", 50 | " )\n", 51 | "\n", 52 | " def forward(self, x):\n", 53 | " x = self.features(x)\n", 54 | " x = x.view(x.size(0), -1)\n", 55 | " x = self.classifier(x)\n", 56 | " return x" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def Maxnet(nbClass=464):\n", 68 | " return maxnet(nbClass)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "def copyParameters(net, modelBase):\n", 80 | " \"\"\"\n", 81 | " Copy parameters from a model to another\n", 82 | " \"\"\"\n", 83 | " #for each feature\n", 84 | " for i, f in enumerate(net.features):\n", 85 | " if type(f) is torch.nn.modules.conv.Conv2d:\n", 86 | " #we copy convolution parameters\n", 87 | " f.weight.data = modelBase.features[i].weight.data\n", 88 | " f.bias.data = modelBase.features[i].bias.data\n", 89 | "\n", 90 | " #for each classifier element\n", 91 | " for i, f in enumerate(net.classifier):\n", 92 | " if type(f) is torch.nn.modules.linear.Linear:\n", 93 | " #we copy fully connected parameters\n", 94 | " if f.weight.size() == modelBase.classifier[i].weight.size():\n", 95 | " f.weight.data = modelBase.classifier[i].weight.data\n", 96 | " f.bias.data = modelBase.classifier[i].bias.data" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 2", 103 | "language": "python", 104 | "name": "python2" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 2 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython2", 116 | "version": "2.7.9" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /model/ModelDefinition.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.parallel 9 | 10 | 11 | # In[2]: 12 | 13 | class maxnet(nn.Module): 14 | def __init__(self, nbClass=464): 15 | super(maxnet, self).__init__() 16 | self.features = nn.Sequential( 17 | nn.Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2)), 18 | nn.ReLU(True), 19 | nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1)), 20 | nn.Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)), 21 | nn.ReLU(True), 22 | nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1)), 23 | nn.Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), 24 | nn.ReLU(True), 25 | nn.Conv2d(384, 256, kernel_size=(3, 3), 
stride=(1, 1), padding=(1, 1)), 26 | nn.ReLU(True), 27 | nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), 28 | nn.ReLU(True), 29 | nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1)) 30 | ) 31 | self.classifier = nn.Sequential( 32 | nn.Dropout(), 33 | nn.Linear(256 * 6 * 6, 4096), 34 | nn.ReLU(inplace=True), 35 | nn.Dropout(), 36 | nn.Linear(4096, 4096), 37 | nn.ReLU(inplace=True), 38 | nn.Linear(4096, nbClass), 39 | ) 40 | 41 | def forward(self, x): 42 | x = self.features(x) 43 | x = x.view(x.size(0), -1) 44 | x = self.classifier(x) 45 | return x 46 | 47 | 48 | # In[ ]: 49 | 50 | def Maxnet(nbClass=464): 51 | return maxnet(nbClass) 52 | 53 | 54 | # In[ ]: 55 | 56 | def copyParameters(net, modelBase): 57 | """ 58 | Copy parameters from a model to another 59 | """ 60 | #for each feature 61 | for i, f in enumerate(net.features): 62 | if type(f) is torch.nn.modules.conv.Conv2d: 63 | #we copy convolution parameters 64 | f.weight.data = modelBase.features[i].weight.data 65 | f.bias.data = modelBase.features[i].bias.data 66 | 67 | #for each classifier element 68 | for i, f in enumerate(net.classifier): 69 | if type(f) is torch.nn.modules.linear.Linear: 70 | #we copy fully connected parameters 71 | if f.weight.size() == modelBase.classifier[i].weight.size(): 72 | f.weight.data = modelBase.classifier[i].weight.data 73 | f.bias.data = modelBase.classifier[i].bias.data 74 | 75 | -------------------------------------------------------------------------------- /model/RNN.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torchvision.models as models 9 | from torch.autograd import Variable 10 | 11 | 12 | # In[ ]: 13 | 14 | class RNN(nn.Module): 15 | """ 16 | Define a RNN network 17 | """ 18 | def __init__(self, net, hidden_size): 19 | super(siamese, self).__init__() 20 | self.features = net 21 | self.rnn = nn.LSTMCell(input_size=net.classifier[len(net.classifier._modules)-1], hidden_size=hidden_size) 22 | 23 | def forward(self, x, hx, cx): 24 | x = self.features(x) 25 | x = self.rnn(x, hx, cx) 26 | return x, hx 27 | 28 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxgreat/Instance-Search/2cea5f64a2d397047072a91788af81c0ea1c6d5e/model/__init__.py -------------------------------------------------------------------------------- /model/cours2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 71, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import torch\n", 12 | "import torch.nn as nn\n", 13 | "import torchvision.models as models\n", 14 | "import torchvision.transforms as transforms\n", 15 | "import random" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "class net1(nn.Module):\n", 27 | " def __init__(self):\n", 28 | " super(net1, self).__init__()\n", 29 | " self.layer1 = nn.Linear(225*225*3, 1000)\n", 30 | " self.relu = nn.ReLU(inplace=True) \n", 31 | " \n", 32 | " def forward(self, x):\n", 33 | " \"\"\"\n", 34 | " x est le vecteur d'entrée\n", 35 | " \"\"\"\n", 36 | " y = self.layer1(x)\n", 37 | " y = 
self.relu(y)\n", 38 | " return y" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 28, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "alex = models.alexnet(pretrained=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 31, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "AlexNet (\n", 64 | " (features): Sequential (\n", 65 | " (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))\n", 66 | " (1): ReLU (inplace)\n", 67 | " (2): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 68 | " (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", 69 | " (4): ReLU (inplace)\n", 70 | " (5): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 71 | " (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 72 | " (7): ReLU (inplace)\n", 73 | " (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 74 | " (9): ReLU (inplace)\n", 75 | " (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 76 | " (11): ReLU (inplace)\n", 77 | " (12): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 78 | " )\n", 79 | " (classifier): Sequential (\n", 80 | " (0): Dropout (p = 0.5)\n", 81 | " (1): Linear (9216 -> 4096)\n", 82 | " (2): ReLU (inplace)\n", 83 | " (3): Dropout (p = 0.5)\n", 84 | " (4): Linear (4096 -> 4096)\n", 85 | " (5): ReLU (inplace)\n", 86 | " (6): Linear (4096 -> 1000)\n", 87 | " )\n", 88 | ")\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "print(alex)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 23, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "import PIL.Image as Image\n", 105 | "im = Image.open(\"/video/CLICIDE/10A-0.JPG\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 27, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "t = torch.Tensor(1, 3, 225, 225)\n", 117 | "trans = transforms.ToTensor()\n", 118 | "t[0] = trans(im.resize( (225, 225) ))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 29, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "output = alex(Variable(t))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 32, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "(Variable containing:\n", 143 | " 9.5034\n", 144 | " [torch.FloatTensor of size 1x1], Variable containing:\n", 145 | " 669\n", 146 | " [torch.LongTensor of size 1x1])" 147 | ] 148 | }, 149 | "execution_count": 32, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "output.max(1)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 33, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "class AlexNet(nn.Module):\n", 167 | " def __init__(self, num_classes=1000):\n", 168 | " super(AlexNet, self).__init__()\n", 169 | " self.features = nn.Sequential(\n", 170 | " nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),\n", 171 | " nn.ReLU(inplace=True),\n", 172 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 173 | " nn.Conv2d(64, 192, kernel_size=5, 
padding=2),\n", 174 | " nn.ReLU(inplace=True),\n", 175 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 176 | " nn.Conv2d(192, 384, kernel_size=3, padding=1),\n", 177 | " nn.ReLU(inplace=True),\n", 178 | " nn.Conv2d(384, 256, kernel_size=3, padding=1),\n", 179 | " nn.ReLU(inplace=True),\n", 180 | " nn.Conv2d(256, 256, kernel_size=3, padding=1),\n", 181 | " nn.ReLU(inplace=True),\n", 182 | " nn.MaxPool2d(kernel_size=3, stride=2),\n", 183 | " )\n", 184 | " self.classifier = nn.Sequential(\n", 185 | " nn.Dropout(),\n", 186 | " nn.Linear(256 * 6 * 6, 4096),\n", 187 | " nn.ReLU(inplace=True),\n", 188 | " nn.Dropout(),\n", 189 | " nn.Linear(4096, 4096),\n", 190 | " nn.ReLU(inplace=True),\n", 191 | " nn.Linear(4096, num_classes),\n", 192 | " )\n", 193 | "\n", 194 | " def forward(self, x):\n", 195 | " x = self.features(x)\n", 196 | " x = x.view(x.size(0), 256 * 6 * 6)\n", 197 | " x = self.classifier(x)\n", 198 | " return x\n", 199 | "\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 47, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "AlexNet (\n", 213 | " (features): Sequential (\n", 214 | " (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))\n", 215 | " (1): ReLU (inplace)\n", 216 | " (2): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 217 | " (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", 218 | " (4): ReLU (inplace)\n", 219 | " (5): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 220 | " (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 221 | " (7): ReLU (inplace)\n", 222 | " (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 223 | " (9): ReLU (inplace)\n", 224 | " (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 225 | " (11): ReLU (inplace)\n", 226 | " (12): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))\n", 227 | " )\n", 228 | " (classifier): Sequential (\n", 229 | " (0): Dropout (p = 0.5)\n", 230 | " (1): Linear (9216 -> 4096)\n", 231 | " (2): ReLU (inplace)\n", 232 | " (3): Dropout (p = 0.5)\n", 233 | " (4): Linear (4096 -> 4096)\n", 234 | " (5): ReLU (inplace)\n", 235 | " (6): Linear (4096 -> 464)\n", 236 | " )\n", 237 | ")" 238 | ] 239 | }, 240 | "execution_count": 47, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "alex464 = models.AlexNet(464)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 54, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "\n", 261 | "\n", 262 | "\n", 263 | "\n", 264 | "\n", 265 | "\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "for c in alex.classifier:\n", 272 | " print(type(c))" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 60, 278 | "metadata": { 279 | "collapsed": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "def copyParameters(net, netBase):\n", 284 | " for i, f in enumerate(net.features):\n", 285 | " if type(f) is torch.nn.modules.conv.Conv2d:\n", 286 | " f.weight.data = netBase.features[i].weight.data\n", 287 | " f.bias.data = netBase.features[i].bias.data\n", 288 | " for i, c in enumerate(net.classifier):\n", 289 | " if type(c) is torch.nn.modules.linear.Linear:\n", 290 | " if c.weight.size() == netBase.classifier[i].weight.size():\n", 291 
| " c.weight.data = netBase.classifier[i].weight.data\n", 292 | " c.bias.data = netBase.classifier[i].bias.data" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 56, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "copyParameters(alex464, models.alexnet(pretrained=True))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 63, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "import torch.optim as optim\n", 315 | "criterion = nn.loss.CrossEntropyLoss()\n", 316 | "optimizer = optim.SGD(alex464.parameters(), lr=0.01, momentum=0.9)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 116, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "import os.path as path\n", 328 | "import glob\n", 329 | "def readTrainingSet(folder=\".\"):\n", 330 | " \"\"\"\n", 331 | " Lit un dossier contenant des images\n", 332 | " Retourne la liste d'image avec leur classe\n", 333 | " \"\"\"\n", 334 | " matchFunc = lambda x: x.split('/')[-1].split('-')[0]\n", 335 | " \n", 336 | " exts = ('*.jpg', '*.JPG', '*.JPEG', \"*.png\")\n", 337 | " r = []\n", 338 | " for ext in exts:\n", 339 | " r.extend( [(im, matchFunc(im)) for im in glob.iglob(path.join(folder, ext)) if not 'wall' in im] )\n", 340 | " return r" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 117, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "trainset = readTrainingSet(\"/video/CLICIDE/\")" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 118, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "listLabel = [t[1] for t in trainset if not 'wall' in t[1]]" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 119, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "464\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "s = set(listLabel)\n", 382 | "s = list(s)\n", 383 | "print(len(s))" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": false, 391 | "scrolled": true 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "[1, 10] loss: 6.143\n", 399 | "[1, 20] loss: 6.084\n", 400 | "[1, 30] loss: 6.041\n", 401 | "[1, 40] loss: 5.915\n", 402 | "[1, 50] loss: 5.707\n", 403 | "[1, 60] loss: 5.503\n", 404 | "[1, 70] loss: 5.481\n", 405 | "[1, 80] loss: 5.295\n", 406 | "[1, 90] loss: 5.075\n", 407 | "[1, 100] loss: 4.737\n", 408 | "[2, 10] loss: 3.990\n", 409 | "[2, 20] loss: 4.320\n", 410 | "[2, 30] loss: 4.300\n", 411 | "[2, 40] loss: 3.964\n", 412 | "[2, 50] loss: 4.021\n", 413 | "[2, 60] loss: 3.972\n", 414 | "[2, 70] loss: 4.121\n", 415 | "[2, 80] loss: 4.176\n", 416 | "[2, 90] loss: 4.125\n", 417 | "[2, 100] loss: 3.864\n", 418 | "[3, 10] loss: 2.850\n", 419 | "[3, 20] loss: 3.330\n", 420 | "[3, 30] loss: 3.021\n", 421 | "[3, 40] loss: 3.085\n", 422 | "[3, 50] loss: 3.290\n", 423 | "[3, 60] loss: 3.062\n", 424 | "[3, 70] loss: 2.940\n", 425 | "[3, 80] loss: 2.637\n", 426 | "[3, 90] loss: 2.795\n", 427 | "[3, 100] loss: 2.678\n", 428 | "[4, 10] loss: 1.977\n", 429 | "[4, 70] loss: 1.932\n", 430 | "[4, 80] loss: 2.296\n", 431 | 
"[4, 90] loss: 2.013\n", 432 | "[4, 100] loss: 2.111\n", 433 | "[5, 10] loss: 1.562\n", 434 | "[5, 20] loss: 1.551\n", 435 | "[5, 30] loss: 1.635\n", 436 | "[5, 40] loss: 1.625\n", 437 | "[5, 50] loss: 1.590\n", 438 | "[5, 60] loss: 1.645\n", 439 | "[5, 70] loss: 1.597\n", 440 | "[5, 80] loss: 1.866\n", 441 | "[5, 90] loss: 2.272\n", 442 | "[5, 100] loss: 2.033\n", 443 | "[6, 10] loss: 1.091\n", 444 | "[6, 20] loss: 1.133\n", 445 | "[6, 30] loss: 1.268\n", 446 | "[6, 40] loss: 0.939\n", 447 | "[6, 50] loss: 0.975\n", 448 | "[6, 60] loss: 1.105\n", 449 | "[6, 70] loss: 1.212\n", 450 | "[6, 80] loss: 1.435\n", 451 | "[6, 90] loss: 1.345\n", 452 | "[6, 100] loss: 1.096\n", 453 | "[7, 10] loss: 0.758\n", 454 | "[7, 20] loss: 0.764\n", 455 | "[7, 30] loss: 0.777\n", 456 | "[7, 40] loss: 0.684\n", 457 | "[7, 50] loss: 0.759\n", 458 | "[7, 60] loss: 0.750\n", 459 | "[7, 70] loss: 0.798\n", 460 | "[7, 80] loss: 0.874\n", 461 | "[7, 90] loss: 0.891\n", 462 | "[7, 100] loss: 0.773\n", 463 | "[8, 10] loss: 0.472\n", 464 | "[8, 20] loss: 0.424\n", 465 | "[8, 30] loss: 0.456\n", 466 | "[8, 40] loss: 0.737\n", 467 | "[8, 50] loss: 0.534\n", 468 | "[8, 60] loss: 0.558\n", 469 | "[8, 70] loss: 0.593\n", 470 | "[8, 80] loss: 0.607\n", 471 | "[8, 90] loss: 0.521\n", 472 | "[8, 100] loss: 0.552\n", 473 | "[9, 10] loss: 0.429\n", 474 | "[9, 20] loss: 0.602\n", 475 | "[9, 30] loss: 0.457\n", 476 | "[9, 40] loss: 0.694\n", 477 | "[9, 50] loss: 0.546\n", 478 | "[9, 60] loss: 0.453\n", 479 | "[9, 70] loss: 0.541\n", 480 | "[9, 80] loss: 0.514\n", 481 | "[9, 90] loss: 0.590\n", 482 | "[9, 100] loss: 0.704\n", 483 | "[10, 10] loss: 0.502\n", 484 | "[10, 20] loss: 0.533\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "batchSize = 32\n", 490 | "alex464\n", 491 | "trans = transforms.ToTensor()\n", 492 | "for epoch in range(10):\n", 493 | " \"\"\"\n", 494 | " On parcourt l'ensemble du training set\n", 495 | " \"\"\"\n", 496 | " alex464.train()\n", 497 | " running_loss = 0.0\n", 498 | " random.shuffle(trainset) \n", 499 | " for i in range(len(trainset)/batchSize):\n", 500 | " \"\"\"\n", 501 | " 1. Charge batchSize images\n", 502 | " 2. 
Backprop\n", 503 | " \"\"\"\n", 504 | " inputs = torch.Tensor(batchSize, 3, 225, 225)\n", 505 | " for j in range(batchSize):\n", 506 | " inputs[j] = trans(Image.open(trainset[i*batchSize+j][0]).resize( (225, 225) ))\n", 507 | " inputs = Variable(inputs)\n", 508 | " \n", 509 | " lab = Variable(torch.LongTensor([s.index(trainset[i*batchSize+j][1]) for j in range(batchSize)]))\n", 510 | " optimizer.zero_grad()\n", 511 | " \n", 512 | " outputs = alex464(inputs)\n", 513 | " loss = criterion(outputs, lab)\n", 514 | " loss.backward()\n", 515 | " optimizer.step()\n", 516 | " \n", 517 | " running_loss += loss.data[0]\n", 518 | " if i % 10 == 9: # print every 10 mini-batches\n", 519 | " print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss / 10))\n", 520 | " running_loss = 0.0\n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \"\"\"\n", 525 | " Eval\n", 526 | " \"\"\"" 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "kernelspec": { 532 | "display_name": "Python 2", 533 | "language": "python", 534 | "name": "python2" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 2 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython2", 546 | "version": "2.7.9" 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 2 551 | } 552 | -------------------------------------------------------------------------------- /model/custom_modules.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Function 6 | from torch.nn.parameter import Parameter 7 | import numpy as np 8 | 9 | 10 | # function to shift an input with a trainable parameter 11 | class ShiftFun(Function): 12 | 13 | def __init__(self): 14 | super(ShiftFun, self).__init__() 15 | 16 | def forward(self, input, param): 17 | self.save_for_backward(input, param) 18 | return input + param.view(1, -1).expand_as(input) 19 | 20 | def backward(self, grad_output): 21 | input, param = self.saved_tensors 22 | grad_input = grad_output.clone() 23 | buf = param.clone().resize_(input.size(0)).fill_(1) 24 | grad_param = torch.mv(grad_output.t(), buf) 25 | return grad_input, grad_param 26 | 27 | 28 | class Shift(nn.Module): 29 | 30 | def __init__(self, n_features): 31 | super(Shift, self).__init__() 32 | self.param = Parameter(torch.Tensor(n_features)) 33 | self.reset_parameters() 34 | 35 | def reset_parameters(self): 36 | self.param.data.fill_(0) 37 | 38 | def forward(self, input): 39 | return ShiftFun()(input, self.param) 40 | 41 | 42 | # autograd function to normalize an input over the rows 43 | # (each vector of a batch is normalized) 44 | # the backward step follows the implementation of 45 | # torch.legacy.nn.Normalize closely 46 | class NormalizeL2Fun(Function): 47 | 48 | def __init__(self, eps=1e-10): 49 | super(NormalizeL2Fun, self).__init__() 50 | self.eps = eps 51 | 52 | def forward(self, input): 53 | self.save_for_backward(input) 54 | self.norm2 = input.pow(2).sum(1).add_(self.eps) 55 | self.norm = self.norm2.pow(0.5) 56 | output = input / self.norm.expand_as(input) 57 | return output 58 | 59 | def backward(self, grad_output): 60 | input = self.saved_tensors[0] 61 | grad_input = self.norm2.expand_as(input) * grad_output 62 | cross = (input * grad_output).sum(1) 63 | buf = input * cross.expand_as(input) 64 | grad_input.add_(-1, buf) 65 | cross = 
self.norm2 * self.norm 66 | grad_input.div_(cross.expand_as(grad_input)) 67 | return grad_input 68 | 69 | 70 | class NormalizeL2(nn.Module): 71 | 72 | def __init__(self): 73 | super(NormalizeL2, self).__init__() 74 | 75 | def forward(self, input): 76 | return NormalizeL2Fun()(input) 77 | 78 | 79 | # metric loss according to Chopra et al "Learning a Similarity Metric Discriminatively, with Application to Face Verification" 80 | # since we assume normalized vectors, we use Q=2 81 | class MetricLossFun(Function): 82 | 83 | def __init__(self, size_average=True): 84 | super(MetricLossFun, self).__init__() 85 | self.size_average = size_average 86 | 87 | # TODO: more things could be done inplace 88 | # this is difficult and probs unnecessary though 89 | def terms(self, input1, input2, y): 90 | diff = input1 - input2 91 | energy = diff.norm(1, 1) 92 | e = (energy * 0).add_(np.e) # fill with e, same shape as energy 93 | exp_term = e.pow_((-2.77 * energy).div_(2)) 94 | return diff, energy, exp_term 95 | 96 | # target takes values in 1 (good), -1 (bad) so (1-target)/2 is 0 for good pairs and 1 for bad ones, (1+target) / 2 inverse 97 | def forward(self, input1, input2, y): 98 | self.save_for_backward(input1, input2, y) 99 | _, energy, exp_term = self.terms(input1, input2, y) 100 | loss = energy.mul_(energy).mul_(1 + y).div_(2) 101 | loss.add_(exp_term.mul_(1 - y).mul_(2)) 102 | loss = loss.sum(0).view(1) 103 | if self.size_average: 104 | loss.div_(y.size(0)) 105 | return loss 106 | 107 | def backward(self, grad_output): 108 | input1, input2, y = self.saved_tensors 109 | diff, energy, exp_term = self.terms(input1, input2, y) 110 | diff[diff.lt(0)] = -1 111 | diff[diff.ge(0)] = 1 112 | energy = energy.expand_as(input1) 113 | exp_term = exp_term.expand_as(input1) 114 | y_g = (1 + y).view(-1, 1).expand_as(input1) 115 | y_i = (1 - y).view(-1, 1).expand_as(input1) 116 | y_g = y_g.mul(diff).mul_(energy) 117 | y_i = y_i.mul(2.77).mul_(diff).mul_(exp_term) 118 | grad1 = y_g.add_(-1, y_i) 119 | grad2 = -grad1 120 | if self.size_average: 121 | grad1.div_(y.size(0)) 122 | grad2.div_(y.size(0)) 123 | g = grad_output[0] 124 | if g != 1: 125 | grad1.mul_(g) 126 | grad2.mul_(g) 127 | return grad1, grad2, None 128 | 129 | 130 | class MetricLoss(nn.Module): 131 | 132 | def __init__(self, size_average=True): 133 | super(MetricLoss, self).__init__() 134 | self.size_average = size_average 135 | 136 | def forward(self, input1, input2, target): 137 | return MetricLossFun(self.size_average)(input1, input2, target) 138 | 139 | 140 | class TripletLossFun(Function): 141 | 142 | def __init__(self, margin, size_average=True, normalized=True): 143 | super(TripletLossFun, self).__init__() 144 | self.size_average = size_average 145 | self.margin = margin 146 | self.normalized = normalized 147 | 148 | # calculate for each sample i: 149 | # 1/2 (||anchor_i - pos_i||^2 - ||anchor_i - neg_i||^2 + 2margin) 150 | # then clamp to positive values and sum over all samples 151 | # when normalized, ||x1-x2||^2 = 2 - 2x1.x2 152 | # so the loss for i becomes: anchor_i . neg_i - anchor_i . 
pos_i + margin 153 | def forward(self, anchor, pos, neg): 154 | self.save_for_backward(anchor, pos, neg) 155 | if self.normalized: 156 | loss = (anchor * neg).sum(1) 157 | loss.add_(-1, (anchor * pos).sum(1)) 158 | loss.add_(self.margin) 159 | else: 160 | sqdiff_pos = (anchor - pos).pow_(2) 161 | sqdiff_neg = (anchor - neg).pow_(2) 162 | loss = sqdiff_pos.sum(1) 163 | loss.add_(-1, sqdiff_neg.sum(1)) 164 | loss.add_(self.margin * 2) 165 | loss.div_(2) 166 | self.clamp = torch.le(loss, 0) 167 | loss[self.clamp] = 0 168 | loss = loss.sum(0).view(1) 169 | if self.size_average: 170 | loss.div_(anchor.size(0)) 171 | return loss 172 | 173 | def backward(self, grad_output): 174 | # grad_pos = -(anchor_i - pos_i) for sample i 175 | # grad_neg = (anchor_i - neg_i) 176 | # grad_anchor = (anchor_i - pos_i) - (anchor_i - neg_i) 177 | # = (neg_i - pos_i) 178 | # if normalized: grad_pos = -anchor_i, grad_neg = anchor_i 179 | # grad_anchor = neg_i - pos_i 180 | anchor, pos, neg = self.saved_tensors 181 | if self.normalized: 182 | grad_anchor = neg - pos 183 | grad_pos = -anchor 184 | grad_neg = -grad_pos 185 | else: 186 | grad_anchor = neg - pos 187 | grad_pos = pos - anchor 188 | grad_neg = anchor - neg 189 | c = self.clamp.expand_as(anchor) 190 | grad_anchor[c] = 0 191 | grad_pos[c] = 0 192 | grad_neg[c] = 0 193 | 194 | if self.size_average: 195 | grad_anchor.div_(anchor.size(0)) 196 | grad_pos.div_(anchor.size(0)) 197 | grad_neg.div_(anchor.size(0)) 198 | g = grad_output[0] 199 | if g != 1: 200 | grad_anchor = grad_anchor.mul_(g) 201 | grad_pos = grad_pos.mul_(g) 202 | grad_neg = grad_neg.mul_(g) 203 | return grad_anchor, grad_pos, grad_neg 204 | 205 | 206 | class TripletLoss(nn.Module): 207 | 208 | def __init__(self, margin, size_average=True, normalized=True): 209 | super(TripletLoss, self).__init__() 210 | self.size_average = size_average 211 | self.margin = margin 212 | self.normalized = normalized 213 | 214 | def forward(self, anchor, pos, neg): 215 | return TripletLossFun(self.margin, self.size_average, self.normalized)(anchor, pos, neg) 216 | -------------------------------------------------------------------------------- /model/nn_utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torchvision.models as models 3 | 4 | 5 | # n < 0 sets all modules/blocks to be untrained 6 | def set_untrained_blocks(containers, n): 7 | # first make sure everything is trainable (not trainable if n<0) 8 | for container in containers: 9 | for m in container: 10 | for p in m.parameters(): 11 | p.requires_grad = n >= 0 12 | 13 | count = 0 14 | for seq in containers: 15 | for m in seq: 16 | if count >= n: 17 | break 18 | if sum(1 for _ in m.parameters()) <= 0: 19 | # exclude modules without params from count 20 | continue 21 | for p in m.parameters(): 22 | p.requires_grad = False 23 | count += 1 24 | 25 | 26 | def convolutionalize(fc, in_size2d): 27 | # Turn an FC layer into Conv2D layer by copying weights the right way 28 | out_size = fc.out_features 29 | in_size_total = fc.in_features 30 | if in_size_total % (in_size2d[0] * in_size2d[1]) != 0: 31 | raise ValueError('FC in_feature size {0} is not divisible by in_size2d {1}'.format(in_size_total, in_size2d)) 32 | in_channels = in_size_total // (in_size2d[0] * in_size2d[1]) 33 | has_bias = fc.bias is not None 34 | conv = nn.Conv2d(in_channels, out_size, in_size2d, bias=has_bias) 35 | if has_bias: 36 | conv.bias.data = fc.bias.data.clone() 37 | for i in range(out_size): 38 | conv.weight.data[i] 
= fc.weight.data[i].view(in_channels, *in_size2d).clone() 39 | return conv 40 | 41 | 42 | def get_feature_size(seq, factor=1, default=-1): 43 | feature_size = default 44 | for module in seq: 45 | if isinstance(module, models.resnet.Bottleneck): 46 | feature_size = module.conv3.out_channels * factor 47 | if isinstance(module, models.resnet.BasicBlock): 48 | feature_size = module.conv2.out_channels * factor 49 | if isinstance(module, nn.modules.Conv2d): 50 | feature_size = module.out_channels * factor 51 | if isinstance(module, nn.modules.linear.Linear): 52 | feature_size = module.out_features 53 | return feature_size 54 | 55 | 56 | def extract_layers(net): 57 | if hasattr(net, 'features') and hasattr(net, 'feature_reduc') and hasattr(net, 'classifier'): 58 | return net.features, net.feature_reduc, net.classifier 59 | if isinstance(net, models.ResNet): 60 | features = [net.conv1, net.bn1, net.relu, net.maxpool] 61 | features.extend(net.layer1) 62 | features.extend(net.layer2) 63 | features.extend(net.layer3) 64 | features.extend(net.layer4) 65 | features = nn.Sequential(*features) 66 | feature_reduc = nn.Sequential(net.avgpool) 67 | classifier = nn.Sequential(net.fc) 68 | else: 69 | features, classifier = net.features, net.classifier 70 | feature_reduc = nn.Sequential() 71 | return features, feature_reduc, classifier 72 | 73 | 74 | def copy_bn_params(m, base_m): 75 | if m.weight is not None: 76 | m.weight.data.copy_(base_m.weight.data) 77 | if m.bias is not None: 78 | m.bias.data.copy_(base_m.bias.data) 79 | m.running_mean.copy_(base_m.running_mean) 80 | m.running_var.copy_(base_m.running_var) 81 | 82 | 83 | def copy_bn_all(seq, base_seq): 84 | for m, base_m in zip(seq, base_seq): 85 | if isinstance(m, nn.Sequential): 86 | copy_bn_all(m, base_m) 87 | if isinstance(m, nn.BatchNorm2d): 88 | copy_bn_params(m, base_m) 89 | if isinstance(m, models.resnet.BasicBlock): 90 | copy_bn_params(m.bn1, base_m.bn1) 91 | copy_bn_params(m.bn2, base_m.bn2) 92 | if m.downsample is None: 93 | continue 94 | copy_bn_all(m.downsample, base_m.downsample) 95 | if isinstance(m, models.resnet.Bottleneck): 96 | copy_bn_params(m.bn1, base_m.bn1) 97 | copy_bn_params(m.bn2, base_m.bn2) 98 | copy_bn_params(m.bn3, base_m.bn3) 99 | if m.downsample is None: 100 | continue 101 | copy_bn_all(m.downsample, base_m.downsample) 102 | 103 | 104 | def bn_new_params(bn, **kwargs): 105 | w, b, rm, rv = bn.weight, bn.bias, bn.running_mean, bn.running_var 106 | new_bn = nn.BatchNorm2d(bn.num_features, **kwargs) 107 | if w and new_bn.weight: 108 | new_bn.weight.data = w.data.clone() 109 | if b and new_bn.bias: 110 | new_bn.bias.data = b.data.clone() 111 | new_bn.running_mean = rm.clone() 112 | new_bn.running_var = rv.clone() 113 | return new_bn 114 | 115 | 116 | def set_batch_norm_params(seq, **kwargs): 117 | for name, block in seq._modules.items(): 118 | if isinstance(block, nn.Sequential): 119 | set_batch_norm_params(block, **kwargs) 120 | if isinstance(block, nn.BatchNorm2d): 121 | seq._modules[name] = bn_new_params(block, **kwargs) 122 | if isinstance(block, models.resnet.BasicBlock): 123 | block.bn1 = bn_new_params(block.bn1, **kwargs) 124 | block.bn2 = bn_new_params(block.bn2, **kwargs) 125 | if block.downsample is None: 126 | continue 127 | set_batch_norm_params(block.downsample, **kwargs) 128 | if isinstance(block, models.resnet.Bottleneck): 129 | block.bn1 = bn_new_params(block.bn1, **kwargs) 130 | block.bn2 = bn_new_params(block.bn2, **kwargs) 131 | block.bn3 = bn_new_params(block.bn3, **kwargs) 132 | if block.downsample is 
None: 133 | continue 134 | set_batch_norm_params(block.downsample, **kwargs) 135 | 136 | 137 | def set_batch_norm_train(seq, train): 138 | for block in seq: 139 | if isinstance(block, nn.Sequential): 140 | set_batch_norm_train(block, train) 141 | if isinstance(block, nn.BatchNorm2d): 142 | block.train(mode=train) 143 | if isinstance(block, models.resnet.BasicBlock): 144 | block.bn1.train(mode=train) 145 | block.bn2.train(mode=train) 146 | if block.downsample is None: 147 | continue 148 | set_batch_norm_train(block.downsample, train) 149 | if isinstance(block, models.resnet.Bottleneck): 150 | block.bn1.train(mode=train) 151 | block.bn2.train(mode=train) 152 | block.bn3.train(mode=train) 153 | if block.downsample is None: 154 | continue 155 | set_batch_norm_train(block.downsample, train) 156 | 157 | 158 | # net is assumed to have only one component containing BatchNorm modules: 159 | # net.features 160 | def set_net_train(net, train, bn_train=False): 161 | net.train(mode=train) 162 | if train and not bn_train: 163 | set_batch_norm_train(net.features, False) 164 | -------------------------------------------------------------------------------- /model/siamese.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 52, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import torch\n", 14 | "import torch.nn as nn\n", 15 | "import torchvision.models as models\n", 16 | "from torch.autograd import Variable" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 38, 22 | "metadata": { 23 | "collapsed": true, 24 | "deletable": true, 25 | "editable": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "class Siamese1(nn.Module):\n", 30 | " \"\"\"\n", 31 | " Define a siamese network\n", 32 | " Given a module, it will duplicate it with weight sharing, concatenate the output and add a linear classifier \n", 33 | " \"\"\"\n", 34 | " def __init__(self, net):\n", 35 | " super(siamese, self).__init__()\n", 36 | " self.features = net\n", 37 | " self.classifier = nn.Linear(net.classifier[len(net.classifier._modules)-1].out_features*2, 1)\n", 38 | " \n", 39 | " def forward(self, x1, x2):\n", 40 | " x = torch.cat( (self.features(x1), self.features(x2)), 1)\n", 41 | " x = self.classifier(x)\n", 42 | " return x" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 49, 48 | "metadata": { 49 | "collapsed": true, 50 | "deletable": true, 51 | "editable": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "class Siamese2(nn.Module):\n", 56 | " \"\"\"\n", 57 | " Define a siamese network\n", 58 | " Given a module, it will duplicate it with weight sharing, concatenate the output and add a linear classifier \n", 59 | " \"\"\"\n", 60 | " def __init__(self, net):\n", 61 | " super(Siamese2, self).__init__()\n", 62 | " self.features = net\n", 63 | " \n", 64 | " def forward(self, x1, x2):\n", 65 | " return (self.features(x1), self.features(x2))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 39, 71 | "metadata": { 72 | "collapsed": false, 73 | "deletable": true, 74 | "editable": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "def siamese():\n", 79 | " return Siamese2(models.alexnet(pretrained=True))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 53, 85 | "metadata": { 86 | "collapsed": false, 87 | "deletable": true, 88 | "editable": true 
89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "if __name__=='__main__':\n", 93 | " t = Variable(torch.Tensor(1,3,225,225))\n", 94 | " s = Siamese2(models.alexnet(pretrained=True))\n", 95 | " o = s(t, t)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "# autograd function to normalize an input over the rows\n", 107 | "# (each vector of a batch is normalized)\n", 108 | "# the backward step follows the implementation of\n", 109 | "# torch.legacy.nn.Normalize closely\n", 110 | "class Normalize2DL2(Function):\n", 111 | "\n", 112 | " def __init__(self, eps=1e-10):\n", 113 | " super(Normalize2DL2, self).__init__()\n", 114 | " self.eps = eps\n", 115 | "\n", 116 | " def forward(self, input):\n", 117 | " self.norm2 = input.pow(2).sum(1).add_(self.eps)\n", 118 | " self.norm = self.norm2.pow(0.5)\n", 119 | " output = input / self.norm.expand_as(input)\n", 120 | " self.save_for_backward(input)\n", 121 | " return output\n", 122 | "\n", 123 | " def backward(self, grad_output):\n", 124 | " input = self.saved_tensors[0]\n", 125 | " gradInput = self.norm2.expand_as(input) * grad_output\n", 126 | " cross = (input * grad_output).sum(1)\n", 127 | " buf = input * cross.expand_as(input)\n", 128 | " gradInput.add_(-1, buf)\n", 129 | " cross = self.norm2 * self.norm\n", 130 | " gradInput.div_(cross.expand_as(gradInput))\n", 131 | " return gradInput" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "class NormalizeL2(nn.Module):\n", 143 | "\n", 144 | " def __init__(self):\n", 145 | " super(NormalizeL2, self).__init__()\n", 146 | "\n", 147 | " def forward(self, input):\n", 148 | " return Normalize2DL2()(input)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "def extract_layers(net):\n", 160 | " if isinstance(net, models.ResNet):\n", 161 | " features = [net.conv1, net.bn1, net.relu, net.maxpool]\n", 162 | " features.extend(net.layer1)\n", 163 | " features.extend(net.layer2)\n", 164 | " features.extend(net.layer3)\n", 165 | " features.extend(net.layer4)\n", 166 | " features = nn.Sequential(*features)\n", 167 | " feature_reduc = nn.Sequential(net.avgpool)\n", 168 | " classifier = nn.Sequential(net.fc)\n", 169 | " else:\n", 170 | " features, classifier = net.features, net.classifier\n", 171 | " feature_reduc = nn.Sequential()\n", 172 | " return features, feature_reduc, classifier" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "class TuneClassif(nn.Module):\n", 184 | " \"\"\"\n", 185 | " Image classification network based on a pretrained network\n", 186 | " which is then finetuned to a different dataset\n", 187 | " It's assumed that the last layer of the given network\n", 188 | " is a fully connected (linear) one\n", 189 | " untrained_blocks specifies how many layers or blocks of layers are\n", 190 | " left untrained (only layers with parameters are counted). 
for ResNet, each 'BottleNeck' or 'BasicBlock' (block containing skip connection for residual) is considered as one block\n", 191 | " \"\"\"\n", 192 | "\n", 193 | " def __init__(self, net, num_classes, untrained_blocks=-1):\n", 194 | " super(TuneClassif, self).__init__()\n", 195 | " features, feature_reduc, classifier = extract_layers(net)\n", 196 | " if untrained_blocks < 0:\n", 197 | " untrained_blocks = sum(1 for _ in features) + sum(1 for _ in classifier)\n", 198 | " self.features = features\n", 199 | " self.feature_reduc = feature_reduc\n", 200 | " self.classifier = classifier\n", 201 | " # make sure we never retrain the first few layers\n", 202 | " # this is usually not needed\n", 203 | " seqs = [self.features, self.feature_reduc, self.classifier]\n", 204 | "\n", 205 | " def has_param(m):\n", 206 | " return sum(1 for _ in m.parameters()) > 0\n", 207 | " count = 0\n", 208 | " for module in (m for seq in seqs for m in seq if has_param(m)):\n", 209 | " if count >= untrained_blocks:\n", 210 | " break\n", 211 | " count += 1\n", 212 | " for p in module.parameters():\n", 213 | " p.requires_grad = False\n", 214 | "\n", 215 | " for name, module in self.classifier._modules.items():\n", 216 | " if module is classifier[len(classifier._modules) - 1]:\n", 217 | " self.classifier._modules[name] = nn.Linear(module.in_features, num_classes)\n", 218 | "\n", 219 | " def forward(self, x):\n", 220 | " x = self.features(x)\n", 221 | " x = self.feature_reduc(x)\n", 222 | " x = x.view(x.size(0), -1)\n", 223 | " x = self.classifier(x)\n", 224 | " return x" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "class Siamese1(nn.Module):\n", 236 | " \"\"\"\n", 237 | " Define a siamese network\n", 238 | " Given a module, it will duplicate it with weight sharing, concatenate the output and add a linear classifier\n", 239 | " \"\"\"\n", 240 | " def __init__(self, net, num_classes=100, feature_dim=100, feature_size2d=(6, 6)):\n", 241 | " super(Siamese1, self).__init__()\n", 242 | " self.features = net.features\n", 243 | " spatial_factor = 4\n", 244 | " self.spatial_feature_reduc = nn.Sequential(\n", 245 | " nn.AvgPool2d(spatial_factor)\n", 246 | " )\n", 247 | " factor = feature_size2d[0] / spatial_factor * feature_size2d[1] / spatial_factor\n", 248 | " for module in self.features:\n", 249 | " if isinstance(module, models.resnet.Bottleneck):\n", 250 | " in_features = module.conv3.out_channels * factor\n", 251 | " if isinstance(module, models.resnet.BasicBlock):\n", 252 | " in_features = module.conv2.out_channels * factor\n", 253 | " if isinstance(module, nn.modules.Conv2d):\n", 254 | " in_features = module.out_channels * factor\n", 255 | " if feature_dim <= 0:\n", 256 | " for module in net.classifier:\n", 257 | " if isinstance(module, nn.modules.linear.Linear):\n", 258 | " out_features = module.out_features\n", 259 | " else:\n", 260 | " out_features = feature_dim\n", 261 | " self.feature_reduc1 = nn.Sequential(\n", 262 | " nn.Dropout(0.5),\n", 263 | " NormalizeL2(),\n", 264 | " nn.Linear(in_features, out_features)\n", 265 | " )\n", 266 | " self.feature_reduc2 = NormalizeL2()\n", 267 | "\n", 268 | " def forward_single(self, x):\n", 269 | " x = self.features(x)\n", 270 | " x = self.spatial_feature_reduc(x)\n", 271 | " x = x.view(x.size(0), -1)\n", 272 | " x = self.feature_reduc1(x)\n", 273 | " x = self.feature_reduc2(x)\n", 274 | " return x\n", 275 | "\n", 276 | " def forward(self, x1, 
x2=None, x3=None):\n", 277 | " if self.training:\n", 278 | " return self.forward_single(x1), self.forward_single(x2), self.forward_single(x3)\n", 279 | " else:\n", 280 | " return self.forward_single(x1)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "# metric loss according to Chopra et al \"Learning a Similarity Metric Discriminatively, with Application to Face Verification\"\n", 292 | "# since we assume normalized vectors, we use Q=2\n", 293 | "class MetricL(Function):\n", 294 | "\n", 295 | " def __init__(self, size_average=True):\n", 296 | " super(MetricL, self).__init__()\n", 297 | " self.size_average = size_average\n", 298 | "\n", 299 | " # TODO: everything could be done inplace,\n", 300 | " # more difficult though (for norm see torch.nn._functions.loss.Cosine...)\n", 301 | " def terms(self, input1, input2, y):\n", 302 | " diff = input1 - input2\n", 303 | " energy = diff.norm(1, 1)\n", 304 | " e = energy * 0 + np.e\n", 305 | " exp_term = torch.pow(e, -2.77 * energy / 2)\n", 306 | " return diff, energy, exp_term\n", 307 | "\n", 308 | " # target takes values in 1 (good), -1 (bad) so (1-target)/2 is 0 for good pairs and 1 for bad ones, (1+target) / 2 inverse\n", 309 | " def forward(self, input1, input2, y):\n", 310 | " _, energy, exp_term = self.terms(input1, input2, y)\n", 311 | " loss_g = (1 + y) * energy * energy / 2\n", 312 | " loss_i = (1 - y) * 2 * exp_term\n", 313 | " loss = (loss_g + loss_i).sum(0).view(1)\n", 314 | " if self.size_average:\n", 315 | " loss.div_(y.size(0))\n", 316 | " self.save_for_backward(input1, input2, y)\n", 317 | " return loss\n", 318 | "\n", 319 | " def backward(self, grad_output):\n", 320 | " input1, input2, y = self.saved_tensors\n", 321 | " diff, energy, exp_term = self.terms(input1, input2, y)\n", 322 | " diff[diff.lt(0)] = -1\n", 323 | " diff[diff.ge(0)] = 1\n", 324 | " y_g = (1 + y).view(-1, 1).expand_as(input1)\n", 325 | " y_i = (1 - y).view(-1, 1).expand_as(input1)\n", 326 | " energy = energy.expand_as(input1)\n", 327 | " exp_term = exp_term.expand_as(input1)\n", 328 | " grad1 = y_g * diff * energy - 2.77 * y_i * diff * exp_term\n", 329 | " grad2 = -grad1\n", 330 | " if self.size_average:\n", 331 | " grad1.div_(y.size(0))\n", 332 | " grad2.div_(y.size(0))\n", 333 | " if grad_output[0] != 1:\n", 334 | " grad1.mul_(grad_output)\n", 335 | " grad2.mul_(grad_output)\n", 336 | " return grad1, grad2, None" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": true 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "class MetricLoss(nn.Module):\n", 348 | "\n", 349 | " def __init__(self, size_average=True):\n", 350 | " super(MetricLoss, self).__init__()\n", 351 | " self.size_average = size_average\n", 352 | "\n", 353 | " def forward(self, input1, input2, target):\n", 354 | " return MetricL(self.size_average)(input1, input2, target)\n", 355 | "\n", 356 | "\n", 357 | "class TripletL(Function):\n", 358 | "\n", 359 | " def __init__(self, margin, size_average=True):\n", 360 | " super(TripletL, self).__init__()\n", 361 | " self.size_average = size_average\n", 362 | " self.margin = margin\n", 363 | "\n", 364 | " def forward(self, anchor, pos, neg):\n", 365 | " sqdiff = anchor.add(-1, pos).pow_(2)\n", 366 | " sqdiff = anchor.add(-1, neg).pow_(2)\n", 367 | " loss = sqdiff.sum(1)\n", 368 | " loss.add_(-1, sqdiff.sum(1))\n", 369 | " loss.add_(self.margin)\n", 370 | " 
self.clamp = torch.lt(loss, 0)\n", 371 | " loss[self.clamp] = 0\n", 372 | " loss = loss.sum(0).view(1)\n", 373 | " if self.size_average:\n", 374 | " loss.div_(anchor.size(0))\n", 375 | " self.save_for_backward(anchor, pos, neg)\n", 376 | " return loss\n", 377 | "\n", 378 | " def backward(self, grad_output):\n", 379 | " # grad_pos = -2(x_anchor - x_pos)\n", 380 | " # grad_neg = 2(x_anchor - x_neg)\n", 381 | " # grad_anchor = 2(x_anchor - x_pos) - 2(x_anchor - x_neg)\n", 382 | " # = -(grad_pos + grad_neg)\n", 383 | " anchor, pos, neg = self.saved_tensors\n", 384 | " c = self.clamp.expand_as(anchor)\n", 385 | " anchor[c] = 0\n", 386 | " pos[c] = 0\n", 387 | " neg[c] = 0\n", 388 | " anchor_sum = anchor.sum(0)\n", 389 | " grad_pos = anchor_sum.add(-1, pos.sum(0)).mul_(-2)\n", 390 | " grad_neg = anchor_sum.add_(-1, neg.sum(0)).mul_(2)\n", 391 | " grad_anchor = grad_pos.add(grad_neg).mul_(-1)\n", 392 | "\n", 393 | " if self.size_average:\n", 394 | " grad_anchor.div_(anchor.size(0))\n", 395 | " grad_pos.div_(anchor.size(0))\n", 396 | " grad_neg.div_(anchor.size(0))\n", 397 | " if grad_output[0] != 1:\n", 398 | " grad_anchor = grad_anchor.mul_(grad_output)\n", 399 | " grad_pos = grad_pos.mul_(grad_output)\n", 400 | " grad_neg = grad_neg.mul_(grad_output)\n", 401 | " grad_anchor = grad_anchor.expand_as(anchor)\n", 402 | " grad_pos = grad_pos.expand_as(anchor)\n", 403 | " grad_neg = grad_neg.expand_as(anchor)\n", 404 | " return grad_anchor, grad_pos, grad_neg\n", 405 | "\n", 406 | "\n", 407 | "class TripletLoss(nn.Module):\n", 408 | "\n", 409 | " def __init__(self, margin, size_average=True):\n", 410 | " super(TripletLoss, self).__init__()\n", 411 | " self.size_average = size_average\n", 412 | " self.margin = margin\n", 413 | "\n", 414 | " def forward(self, anchor, pos, neg):\n", 415 | " return TripletL(self.margin, self.size_average)(anchor, pos, neg)" 416 | ] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "Python 2", 422 | "language": "python", 423 | "name": "python2" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 2 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython2", 435 | "version": "2.7.9" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 2 440 | } 441 | -------------------------------------------------------------------------------- /model/siamese.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from nn_utils import * 7 | from custom_modules import * 8 | 9 | 10 | class TuneClassif(nn.Module): 11 | """ 12 | Image classification network based on a pretrained network 13 | which is then finetuned to a different dataset 14 | It's assumed that the last layer of the given network 15 | is a fully connected (linear) one 16 | untrained specifies how many layers or blocks of layers are 17 | left untrained (only layers with parameters are counted). 
for ResNet, each 'BottleNeck' or 'BasicBlock' (block containing skip connection for residual) is considered as one block 18 | """ 19 | 20 | def __init__(self, net, num_classes, untrained=-1, reduc=True): 21 | super(TuneClassif, self).__init__() 22 | self.features, self.feature_reduc, self.classifier = extract_layers(net) 23 | # make sure we never retrain the first few layers 24 | # this is usually not needed 25 | set_untrained_blocks([self.features, self.classifier], untrained) 26 | 27 | # replace last module of classifier with a reduced one 28 | last_module = self.classifier[len(self.classifier._modules) - 1] 29 | if not isinstance(last_module, nn.Linear) or last_module.out_features != num_classes: 30 | for name, module in self.classifier._modules.items(): 31 | if module is last_module: 32 | self.classifier._modules[name] = nn.Linear(module.in_features, num_classes) 33 | 34 | self.feature_size = num_classes 35 | # if no reduc is wanted, remove it 36 | if not reduc: 37 | factor = 1 38 | for m in self.feature_reduc: 39 | try: 40 | factor *= (m.kernel_size[0] * m.kernel_size[1]) 41 | except TypeError: 42 | factor *= m.kernel_size * m.kernel_size 43 | # increase the number of input features on first classifier module 44 | for name, module in self.classifier._modules.items(): 45 | if module is self.classifier[0]: 46 | self.classifier._modules[name] = nn.Linear(module.in_features * factor, module.out_features) 47 | self.feature_reduc = nn.Sequential() 48 | 49 | def forward(self, x): 50 | x = self.features(x) 51 | x = self.feature_reduc(x) 52 | x = x.view(x.size(0), -1) 53 | x = self.classifier(x) 54 | return x 55 | 56 | 57 | class TuneClassifSub(TuneClassif): 58 | """ 59 | Image classification network based on a pretrained network 60 | which is then finetuned to a different dataset, as above 61 | Here, all sub-parts of the image are classified by 62 | convolutionalizing the linear classification layers 63 | """ 64 | def __init__(self, net, num_classes, feature_size2d, untrained=-1): 65 | super(TuneClassifSub, self).__init__(net, num_classes, untrained, reduc=True) 66 | reduc_count = sum(1 for _ in self.feature_reduc) 67 | if reduc_count > 0: 68 | # in a ResNet, apply stride 1 feature size avg pool reduction 69 | self.feature_reduc = nn.Sequential( 70 | nn.AvgPool2d(feature_size2d, stride=1) 71 | ) 72 | # convolutionalize the linear layers in classifier 73 | count = 0 74 | for name, module in self.classifier._modules.items(): 75 | if isinstance(module, nn.modules.linear.Linear): 76 | size2d = feature_size2d 77 | if reduc_count > 0 or count > 0: 78 | size2d = (1, 1) 79 | self.classifier._modules[name] = convolutionalize(module, size2d) 80 | count += 1 81 | 82 | def forward_single(self, x): 83 | x = self.features(x) 84 | x = self.feature_reduc(x) 85 | x = self.classifier(x) 86 | return x 87 | 88 | def forward(self, *scales): 89 | return [self.forward_single(x) for x in scales] 90 | 91 | 92 | class DescriptorNet(nn.Module): 93 | """ 94 | Define a siamese network 95 | Given a network, obtain its features, then apply spatial reduction 96 | (optional) and a norm, shift+linear, norm reduction to obtain a 97 | descriptor. 
98 | TODO description 99 | """ 100 | def __init__(self, net, feature_dim, feature_size2d, untrained=-1): 101 | super(DescriptorNet, self).__init__() 102 | self.features, _, classifier = extract_layers(net) 103 | set_untrained_blocks([self.features], untrained) 104 | factor = feature_size2d[0] * feature_size2d[1] 105 | in_features = get_feature_size(self.features, factor) 106 | if feature_dim <= 0: 107 | self.feature_size = get_feature_size(classifier) 108 | else: 109 | self.feature_size = feature_dim 110 | self.feature_reduc1 = nn.Sequential( 111 | NormalizeL2(), 112 | Shift(in_features), 113 | nn.Linear(in_features, self.feature_size) 114 | ) 115 | self.feature_reduc2 = NormalizeL2() 116 | 117 | def forward_single(self, x): 118 | x = self.features(x) 119 | x = x.view(x.size(0), -1) 120 | x = self.feature_reduc1(x) 121 | x = self.feature_reduc2(x) 122 | return x 123 | 124 | def forward(self, x1, x2=None, x3=None): 125 | if self.training and x3 is not None: 126 | return self.forward_single(x1), self.forward_single(x2), self.forward_single(x3) 127 | elif self.training: 128 | return self.forward_single(x1), self.forward_single(x2) 129 | else: 130 | return self.forward_single(x1) 131 | 132 | 133 | class RegionDescriptorNet(nn.Module): 134 | """ 135 | Define a siamese network 136 | Given a network, obtain its features and apply spatial reduction 137 | (optional). The feature maps can have any size here, so we apply 138 | a classifier (obtained from the given network) to all locations 139 | in the feature map. Finally, we sum the features in those regions 140 | obtaining the highest classification values and apply normalization, 141 | shifting, linear, normalization to obtain a global descriptor. 142 | In order to allow training for both the descriptor and the classifier, 143 | the classification values are output as well as the descriptor 144 | for all input images. 
145 | 146 | Use the k highest values from the classifier to obtain descriptor 147 | """ 148 | def __init__(self, net, k, feature_dim, feature_size2d, untrained=-1): 149 | super(RegionDescriptorNet, self).__init__() 150 | self.k = k 151 | self.feature_size2d = feature_size2d 152 | self.features, self.feature_reduc, self.classifier = extract_layers(net) 153 | 154 | # factor = 1 155 | factor = feature_size2d[0] * feature_size2d[1] 156 | in_features = get_feature_size(self.features, factor) 157 | if feature_dim <= 0: 158 | self.feature_size = get_feature_size(classifier) 159 | else: 160 | self.feature_size = feature_dim 161 | reduc_count = sum(1 for _ in self.feature_reduc) 162 | if reduc_count > 0: 163 | # we are a ResNet or similar, apply feature_size AvgPool stride 1 164 | self.feature_reduc = nn.Sequential( 165 | nn.AvgPool2d(feature_size2d, stride=1) 166 | ) 167 | # convolutionalize the linear layers in classifier 168 | count = 0 169 | for name, module in self.classifier._modules.items(): 170 | if isinstance(module, nn.modules.linear.Linear): 171 | size2d = feature_size2d 172 | if reduc_count > 0 or count > 0: 173 | size2d = (1, 1) 174 | self.classifier._modules[name] = convolutionalize(module, size2d) 175 | count += 1 176 | set_untrained_blocks([self.features, self.classifier], untrained) 177 | self.feature_reduc1 = nn.Sequential( 178 | NormalizeL2(), 179 | Shift(in_features), 180 | nn.Linear(in_features, self.feature_size) 181 | ) 182 | self.feature_reduc2 = NormalizeL2() 183 | 184 | # this can only be done using a single input (batch size: 1) TODO 185 | def forward_single(self, x): 186 | x = self.features(x) 187 | c = self.feature_reduc(x) 188 | c = self.classifier(c) 189 | # get maximal classification values and choose indexes with 190 | # highest maximal classification 191 | c_maxv, _ = c.max(1) 192 | c_maxv = c_maxv.view(-1) 193 | k = min(c_maxv.size(0), self.k) 194 | _, flat_idx = c_maxv.topk(k) 195 | 196 | # transform flat classification indexes to feature indexes 197 | # first, flat index -> 2d classification index, then add 198 | # feature size to obtain the region in feature map 199 | def feature_idx(flat_idx): 200 | cls_idx = flat_idx // c.size(3), flat_idx % c.size(3) 201 | return (cls_idx[0], cls_idx[0] + self.feature_size2d[0], 202 | cls_idx[1], cls_idx[1] + self.feature_size2d[1]) 203 | top_idx = [feature_idx(int(i)) for i in flat_idx.data] 204 | # needed for output 205 | tmp = c_maxv.data.clone().resize_(c.size(0), self.feature_size) 206 | acc = Variable(tmp.fill_(0)) 207 | tmp = c_maxv.data.clone().resize_(c.size(0), c.size(1), self.k) 208 | cls_out = Variable(tmp.fill_(0)) 209 | 210 | # for all top maximal classification indexes, output the actual 211 | # classification values at those indexes 212 | # for the descriptor, use the feature indexes and then reduce 213 | # accumulate regional descriptors using addition 214 | i = 0 215 | for x1, x2, y1, y2 in top_idx: 216 | cls_out[:, :, i] = c[:, :, x1, y1] 217 | i += 1 218 | region = x[:, :, x1:x2, y1:y2].contiguous().view(x.size(0), -1) 219 | region = self.feature_reduc1(region) 220 | acc = acc + region 221 | # finally, perform final reduction (normalization) 222 | x = self.feature_reduc2(acc) 223 | return x, cls_out 224 | 225 | def forward(self, x1, x2=None, x3=None): 226 | if self.training and x3 is not None: 227 | return self.forward_single(x1), self.forward_single(x2), self.forward_single(x3) 228 | elif self.training: 229 | return self.forward_single(x1), self.forward_single(x2) 230 | else: 231 | return 
self.forward_single(x1)[0] 232 | -------------------------------------------------------------------------------- /pre_process_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import cv2 4 | import numpy as np 5 | from os import path 6 | from utils import get_images_labels, match_label_fou_clean2, match_label_video 7 | 8 | # resize all images of a dataset and place them into a new folder 9 | 10 | # path to folders containing train and test images 11 | dataset = '/home/mrim/data/collection/GUIMUTEIC/FOURVIERE_CLEAN2/TRAIN_I' 12 | dataset_test = '/home/mrim/data/collection/GUIMUTEIC/FOURVIERE_CLEAN2/TEST_I' 13 | # function to match the labels in image names 14 | match_labels = match_label_fou_clean2 15 | # paths where the resized images are placed 16 | out_path = './data/pre_proc/fourviere_clean2_448' 17 | out_path_test = './data/pre_proc/fourviere_clean2_448/test' 18 | 19 | # training and test sets (scaled to 300 on the small side) 20 | dataSetFull = get_images_labels(dataset, match_labels) 21 | testSetFull = get_images_labels(dataset_test, match_labels) 22 | 23 | 24 | # resize function 25 | def resize(dataset, out_path, max_ar, newsize1, newsize2=None): 26 | for im, lab in dataset: 27 | im_o = cv2.imread(im) 28 | h, w, _ = im_o.shape 29 | if max_ar >= 1. and ((h > w and float(h) / w > max_ar) or (h < w and float(w) / h > max_ar)): 30 | # force a max aspect ratio of max_ar by padding image with random uniform noise 31 | def pad_rand(vector, pad_width, iaxis, kwargs): 32 | if pad_width[0] > 0: 33 | vector[:pad_width[0]] = np.random.randint(256, size=pad_width[0]) 34 | if pad_width[1] > 0: 35 | vector[-pad_width[1]:] = np.random.randint(256, size=pad_width[1]) 36 | return vector 37 | if h > w: 38 | ow = int(np.ceil(float(h) / max_ar)) 39 | w_pad = (ow - w) // 2 40 | w_mod = (ow - w) % 2 41 | im_o = np.pad(im_o, ((0, 0), (w_pad + w_mod, w_pad), (0, 0)), pad_rand) 42 | else: 43 | oh = int(np.ceil(float(w) / max_ar)) 44 | h_pad = (oh - h) // 2 45 | h_mod = (oh - h) % 2 46 | im_o = np.pad(im_o, ((h_pad + h_mod, h_pad), (0, 0), (0, 0)), pad_rand) 47 | h, w, _ = im_o.shape 48 | if newsize2 is None: 49 | if (w <= h and w == newsize1) or (h <= w and h == newsize1): 50 | ow, oh = w, h 51 | elif (w < h): 52 | ow, oh = newsize1, int(round(float(newsize1 * h) / w)) 53 | else: 54 | ow, oh = int(round(float(newsize1 * w) / h)), newsize1 55 | else: 56 | ow, oh = newsize1, newsize2 57 | if ow == w and oh == h: 58 | im_out = im_o 59 | else: 60 | im_out = cv2.resize(im_o, (ow, oh), interpolation=cv2.INTER_CUBIC) 61 | out_p = path.join(out_path, im.split('/')[-1]) 62 | print('/'.join(im.split('/')[-3:]), '->', '/'.join(out_p.split('/')[-3:])) 63 | cv2.imwrite(out_p, im_out) 64 | 65 | 66 | resize(dataSetFull, out_path, 2.0, 448) 67 | resize(testSetFull, out_path_test, 2.0, 448) 68 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxgreat/Instance-Search/2cea5f64a2d397047072a91788af81c0ea1c6d5e/test/__init__.py -------------------------------------------------------------------------------- /test/classif_finetune_test.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import sys 5 | import getopt 6 | import torchvision.transforms as transforms 7 | from model.nn_utils import 
set_net_train 8 | from utils import * 9 | from train.classif_finetune import P, labels, test_classif_net 10 | from train.global_p import * 11 | from train.classif_finetune import get_embeddings, get_class_net 12 | from instance_avg import instance_avg 13 | 14 | 15 | def usage(): 16 | print('Usage: ' + sys.argv[0] + ' [options]') 17 | prefix = 'Options:\n\tRequired:\n' 18 | o1 = ('--dataset=\t\tThe path to the dataset containing all ' + 19 | 'reference images. It should contain a sub-folder "test" ' + 20 | 'containing all test images\n') 21 | o2 = ('--model=\t\tEither AlexNet or ResNet152 to specify the ' + 22 | 'type of model.\n') 23 | o3 = ('--weights=\t\tThe filename containing weights of a ' + 24 | 'network trained for sub-region classification.\n') 25 | o4 = ('--device=\t\tThe GPU device used for testing. ' + 26 | 'If negative, CPU is used.\n') 27 | o5 = ('--classify=\t\tTrue/yes/y/1 if the classification ' + 28 | 'feature should be tested. Otherwise, convolutional features ' + 29 | 'are tested.\n') 30 | o6 = ('--batch=\t\tThe batch size to use.\n') 31 | o7 = ('--dba=\t\tUse DBA with given k. If k = 0, do not use DBA. ' + 32 | 'If k<0, use all neighbors within the same instance.\n') 33 | o8 = '--help\t\tShow this help\n' 34 | print(prefix + o1 + o2 + o3 + o4 + o5 + o6 + o7 + o8) 35 | 36 | 37 | def main(dataset_full, model, weights, device, classify, batch_size, dba): 38 | # training and test sets 39 | dataset_id = parse_dataset_id(dataset_full) 40 | match_labels = match_label_functions[dataset_id] 41 | train_set_full = get_images_labels(dataset_full, match_labels) 42 | test_set_full = get_images_labels(dataset_full + '/test', match_labels) 43 | 44 | labels_list = [t[1] for t in train_set_full] 45 | # setup global params so that testing functions work properly 46 | labels.extend(sorted(list(set(labels_list)))) 47 | P.test_pre_proc = True # we always pre process images 48 | P.cuda_device = device 49 | P.image_input_size = image_sizes[dataset_id] 50 | P.test_batch_size = batch_size 51 | P.preload_net = weights 52 | P.cnn_model = model 53 | P.feature_size2d = feature_sizes[model, image_sizes[dataset_id]] 54 | P.embeddings_classify = classify 55 | out_size = len(labels) if classify else flat_feature_sizes[model, P.image_input_size] 56 | P.feature_dim = out_size 57 | 58 | print('Loading and transforming train/test sets.') 59 | 60 | # open the images (and transform already if possible) 61 | # do that only if it fits in memory ! 
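# Rough sketch of the retrieval evaluation performed below. The embeddings
# returned by get_embeddings are L2-normalized, so the torch.mm product is a
# cosine-similarity matrix between test and reference images. precision1 and
# mean_avg_precision are helpers from utils whose implementation is not shown
# in this file; the semantics assumed here are approximately:
#
#   sim = torch.mm(test_embeddings, ref_embeddings.t())  # (n_test, n_ref)
#   _, nearest = sim.max(1)  # most similar reference for each test image
#   # precision@1: fraction of test images whose nearest reference shares
#   # their instance label; mAP averages precision over the whole ranking
#   # rather than only the top result.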
62 | m, s = read_mean_std(mean_std_files[dataset_id]) 63 | test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 64 | test_set, test_train_set = [], [] 65 | for im, lab in train_set_full: 66 | im_o = imread_rgb(im) 67 | test_train_set.append((test_trans(im_o), lab, im)) 68 | 69 | for im, lab in test_set_full: 70 | if lab not in labels: 71 | continue 72 | im_o = imread_rgb(im) 73 | test_set.append((test_trans(im_o), lab, im)) 74 | 75 | print('Testing network on dataset with ID {0}'.format(dataset_id)) 76 | class_net = get_class_net() 77 | set_net_train(class_net, False) 78 | c, t = test_classif_net(class_net, test_set) 79 | print('Classification (TEST): {0} / {1} - acc: {2:.4f}'.format(c, t, float(c) / t)) 80 | test_embeddings = get_embeddings(class_net, test_set, device, out_size) 81 | ref_embeddings = get_embeddings(class_net, test_train_set, device, out_size) 82 | sim = torch.mm(test_embeddings, ref_embeddings.t()) 83 | prec1, c, t, _, _ = precision1(sim, test_set, test_train_set) 84 | mAP = mean_avg_precision(sim, test_set, test_train_set) 85 | print('Descriptor (TEST): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP)) 86 | if dba == 0: 87 | return 88 | print('Testing using instance feature augmentation') 89 | dba_embeddings, dba_set = instance_avg(device, ref_embeddings, 90 | test_train_set, labels, dba) 91 | sim = torch.mm(test_embeddings, dba_embeddings.t()) 92 | prec1, c, t, _, _ = precision1(sim, test_set, dba_set) 93 | mAP = mean_avg_precision(sim, test_set, dba_set) 94 | print('Descriptor (TEST DBA k={4}): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP, dba)) 95 | 96 | 97 | if __name__ == '__main__': 98 | options_l = (['help', 'dataset=', 'model=', 'weights=', 'device=', 99 | 'classify=', 'batch=', 'dba=']) 100 | try: 101 | opts, args = getopt.getopt(sys.argv[1:], '', options_l) 102 | except getopt.GetoptError: 103 | usage() 104 | sys.exit(2) 105 | dataset_full, model, weights, device = None, None, None, None 106 | classify, batch_size, dba = None, None, -1 107 | for opt, arg in opts: 108 | if opt in ('--help'): 109 | usage() 110 | sys.exit() 111 | elif opt in ('--dataset'): 112 | dataset_full = check_folder(arg, 'dataset', True, usage) 113 | elif opt in ('--model'): 114 | model = check_model(arg, usage) 115 | elif opt in ('--weights'): 116 | weights = check_file(arg, 'initialization weights', True, usage) 117 | elif opt in ('--device'): 118 | device = check_int(arg, 'device', usage) 119 | elif opt in ('--classify'): 120 | classify = check_bool(arg, 'classify', usage) 121 | elif opt in ('--batch'): 122 | batch_size = check_int(arg, 'batch', usage) 123 | elif opt in ('--dba'): 124 | dba = check_int(arg, 'dba', usage) 125 | if (dataset_full is None or model is None or 126 | device is None or classify is None or batch_size is None): 127 | print('One or more required arguments is missing.') 128 | usage() 129 | sys.exit(2) 130 | 131 | with torch.cuda.device(device): 132 | try: 133 | main(dataset_full, model, weights, device, classify, 134 | batch_size, dba) 135 | except: 136 | log_detail(P, None, traceback.format_exc()) 137 | raise 138 | -------------------------------------------------------------------------------- /test/classif_regions_test.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import sys 5 | import getopt 6 | import torchvision.transforms as transforms 7 | from model.nn_utils import set_net_train 8 | from utils 
import * 9 | from train.classif_regions import P, labels, test_classif_net 10 | from train.classif_regions import get_embeddings, get_class_net 11 | from instance_avg import instance_avg 12 | 13 | 14 | def usage(): 15 | print('Usage: ' + sys.argv[0] + ' [options]') 16 | prefix = 'Options:\n\tRequired:\n' 17 | o1 = ('--dataset=\t\tThe path to the dataset containing all ' + 18 | 'reference images. It should contain a sub-folder "test" ' + 19 | 'containing all test images\n') 20 | o2 = ('--model=\t\tEither AlexNet or ResNet152 to specify the ' + 21 | 'type of model.\n') 22 | o3 = ('--weights=\t\tThe filename containing weights of a ' + 23 | 'network trained for sub-region classification.\n') 24 | o4 = ('--device=\t\tThe GPU device used for testing. ' + 25 | 'If negative, CPU is used.\n') 26 | o5 = ('--dba=\t\tUse DBA with given k. If k = 0, do not use DBA. ' + 27 | 'If k<0, use all neighbors within the same instance.\n') 28 | o6 = '--help\t\tShow this help\n' 29 | print(prefix + o1 + o2 + o3 + o4 + o5 + o6) 30 | 31 | 32 | def main(dataset_full, model, weights, device, dba): 33 | # training and test sets 34 | dataset_id = parse_dataset_id(dataset_full) 35 | match_labels = match_label_functions[dataset_id] 36 | train_set_full = get_images_labels(dataset_full, match_labels) 37 | test_set_full = get_images_labels(dataset_full + '/test', match_labels) 38 | 39 | labels_list = [t[1] for t in train_set_full] 40 | # setup global params so that testing functions work properly 41 | labels.extend(sorted(list(set(labels_list)))) 42 | P.test_pre_proc = True # we always pre process images 43 | P.cuda_device = device 44 | P.preload_net = weights 45 | P.cnn_model = model 46 | P.feature_size2d = feature_sizes[model, image_sizes[dataset_id]] 47 | P.bn_model = '' # only useful for training 48 | 49 | print('Loading and transforming train/test sets.') 50 | 51 | # open the images (and transform already if possible) 52 | # do that only if it fits in memory ! 
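# Note on the DBA step used at the end of main(): instance_avg (defined in
# test/instance_avg.py) replaces each reference embedding by a weighted sum
# of itself and its k nearest neighbours belonging to the same instance,
# then re-normalizes. With num_neighbors = k, the weighting applied there is:
#
#   agg_i   = e_i + sum_{j=0..k-1} ((k - j) / (k + 1)) * e_neighbour_j
#   new_e_i = agg_i / (||agg_i|| + 1e-10)
#
# so closer neighbours contribute more, and a negative k uses all images of
# the same instance as neighbours.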
53 | m, s = read_mean_std(mean_std_files[dataset_id]) 54 | test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 55 | test_set, test_train_set = [], [] 56 | for im, lab in train_set_full: 57 | im_o = imread_rgb(im) 58 | test_train_set.append((test_trans(im_o), lab, im)) 59 | 60 | for im, lab in test_set_full: 61 | if lab not in labels: 62 | continue 63 | im_o = imread_rgb(im) 64 | test_set.append((test_trans(im_o), lab, im)) 65 | 66 | print('Testing network on dataset with ID {0}'.format(dataset_id)) 67 | class_net = get_class_net() 68 | set_net_train(class_net, False) 69 | c, t = test_classif_net(class_net, test_set) 70 | print('Classification (TEST): {0} / {1} - acc: {2:.4f}'.format(c, t, float(c) / t)) 71 | test_embeddings = get_embeddings(class_net, test_set, device, len(labels)) 72 | ref_embeddings = get_embeddings(class_net, test_train_set, device, len(labels)) 73 | sim = torch.mm(test_embeddings, ref_embeddings.t()) 74 | prec1, c, t, _, _ = precision1(sim, test_set, test_train_set) 75 | mAP = mean_avg_precision(sim, test_set, test_train_set) 76 | print('Descriptor (TEST): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP)) 77 | if dba == 0: 78 | return 79 | print('Testing using instance feature augmentation') 80 | dba_embeddings, dba_set = instance_avg(device, ref_embeddings, 81 | test_train_set, labels, dba) 82 | sim = torch.mm(test_embeddings, dba_embeddings.t()) 83 | prec1, c, t, _, _ = precision1(sim, test_set, dba_set) 84 | mAP = mean_avg_precision(sim, test_set, dba_set) 85 | print('Descriptor (TEST DBA k={4}): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP, dba)) 86 | 87 | 88 | if __name__ == '__main__': 89 | options_l = (['help', 'dataset=', 'model=', 'weights=', 'device=', 'dba=']) 90 | try: 91 | opts, args = getopt.getopt(sys.argv[1:], '', options_l) 92 | except getopt.GetoptError: 93 | usage() 94 | sys.exit(2) 95 | dataset_full, model, weights, device, dba = None, None, None, None, -1 96 | for opt, arg in opts: 97 | if opt in ('--help'): 98 | usage() 99 | sys.exit() 100 | elif opt in ('--dataset'): 101 | dataset_full = check_folder(arg, 'dataset', True, usage) 102 | elif opt in ('--model'): 103 | model = check_model(arg, usage) 104 | elif opt in ('--weights'): 105 | weights = check_file(arg, 'initialization weights', True, usage) 106 | elif opt in ('--device'): 107 | device = check_int(arg, 'device', usage) 108 | elif opt in ('--dba'): 109 | dba = check_int(arg, 'dba', usage) 110 | if (dataset_full is None or model is None or 111 | weights is None or device is None): 112 | print('One or more required arguments is missing.') 113 | usage() 114 | sys.exit(2) 115 | 116 | with torch.cuda.device(device): 117 | try: 118 | main(dataset_full, model, weights, device, dba) 119 | except: 120 | log_detail(P, None, traceback.format_exc()) 121 | raise 122 | -------------------------------------------------------------------------------- /test/instance_avg.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | from utils import get_lab_indicators 5 | 6 | 7 | def instance_avg(device, embeddings, dataset, labels, k=-1): 8 | # create new embeddings for the dataset, where each embedding 9 | # is replaced with a weighted sum of the k nearest neighbors 10 | # within its instance. 
if k is negative, all neighbors of the instance 11 | # are used 12 | sim = torch.mm(embeddings, embeddings.t()) 13 | # for each embedding, set the similarities to embeddings of different 14 | # labels to -2, plus to itself, so the maximal similarities are always 15 | # neighbors of the same instance 16 | lab_ind = get_lab_indicators(dataset, device) 17 | new_embeddings = embeddings.clone() 18 | for i, (_, lab, _) in enumerate(dataset): 19 | num_neighbors = lab_ind[lab].sum() - 1 20 | if k >= 0 and k < num_neighbors: 21 | num_neighbors = k 22 | if num_neighbors <= 0: 23 | new_embeddings[i] = embeddings[i] 24 | continue 25 | sim[i, i] = -2 26 | sim[i][1 - lab_ind[lab]] = -2 27 | _, best_neighbors = torch.sort(sim[i], dim=0, descending=True) 28 | agg_embedding = embeddings[i].clone() 29 | for j in range(num_neighbors): 30 | weight = (num_neighbors - j) / float(num_neighbors + 1) 31 | agg_embedding += embeddings[best_neighbors[j]] * weight 32 | new_embeddings[i] = agg_embedding / (agg_embedding.norm() + 1e-10) 33 | return new_embeddings, dataset 34 | 35 | 36 | # a method of simply averaging the descriptors for each instance 37 | # this is less useful as it may pull outliers into the average 38 | # def instance_avg(device, embeddings, dataset, labels): 39 | # # create a fictional dataset with one entry per label, with 40 | # # its embeddings as the average of all descriptors of each label 41 | # fictional_set = [(None, lab, None) for lab in labels] 42 | # new_embeddings = tensor(device, len(labels), embeddings.size(1)) 43 | # avg = {lab: tensor(device, embeddings.size(1)).fill_(0) 44 | # for lab in labels} 45 | # for embedding, (_, lab, _) in zip(embeddings, dataset): 46 | # avg[lab] += embedding # no need to average since we normalize 47 | # for i, lab in enumerate(labels): 48 | # new_embeddings[i] = avg[lab] / (avg[lab].norm() + 1e-10) 49 | # return new_embeddings, fictional_set 50 | -------------------------------------------------------------------------------- /test/siamese_descriptor_test.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import sys 5 | import getopt 6 | import torchvision.transforms as transforms 7 | from model.nn_utils import set_net_train 8 | from utils import * 9 | from train.siamese_descriptor import P, labels 10 | from train.siamese_descriptor import get_embeddings, get_siamese_net 11 | from instance_avg import instance_avg 12 | 13 | 14 | def usage(): 15 | print('Usage: ' + sys.argv[0] + ' [options]') 16 | prefix = 'Options:\n\tRequired:\n' 17 | o1 = ('--dataset=\t\tThe path to the dataset containing all ' + 18 | 'reference images. It should contain a sub-folder "test" ' + 19 | 'containing all test images\n') 20 | o2 = ('--model=\t\tEither AlexNet or ResNet152 to specify the ' + 21 | 'type of model.\n') 22 | o3 = ('--weights=\t\tThe filename containing weights of a ' + 23 | 'network trained for sub-region classification.\n') 24 | o4 = ('--device=\t\tThe GPU device used for testing. ' + 25 | 'If negative, CPU is used.\n') 26 | o5 = ('--feature-dim=\t\tThe feature dimensionality of the network.\n') 27 | o6 = ('--batch=\t\tThe batch size to use.\n') 28 | o7 = ('--dba=\t\tUse DBA with given k. If k = 0, do not use DBA. 
' + 29 | 'If k<0, use all neighbors within the same instance.\n') 30 | o8 = '--help\t\tShow this help\n' 31 | print(prefix + o1 + o2 + o3 + o4 + o5 + o6 + o7 + o8) 32 | 33 | 34 | def main(dataset_full, model, weights, device, feature_dim, batch_size, dba): 35 | # training and test sets 36 | dataset_id = parse_dataset_id(dataset_full) 37 | match_labels = match_label_functions[dataset_id] 38 | train_set_full = get_images_labels(dataset_full, match_labels) 39 | test_set_full = get_images_labels(dataset_full + '/test', match_labels) 40 | 41 | labels_list = [t[1] for t in train_set_full] 42 | # setup global params so that testing functions work properly 43 | labels.extend(sorted(list(set(labels_list)))) 44 | P.num_classes = len(labels) 45 | P.test_pre_proc = True # we always pre process images 46 | P.cuda_device = device 47 | P.image_input_size = image_sizes[dataset_id] 48 | P.preload_net = weights 49 | P.cnn_model = model 50 | P.feature_size2d = feature_sizes[model, image_sizes[dataset_id]] 51 | P.classif_model = '' # only useful for training 52 | P.feature_dim = feature_dim 53 | P.test_batch_size = batch_size 54 | 55 | print('Loading and transforming train/test sets.') 56 | 57 | # open the images (and transform already if possible) 58 | # do that only if it fits in memory ! 59 | m, s = read_mean_std(mean_std_files[dataset_id]) 60 | test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 61 | test_set, test_train_set = [], [] 62 | for im, lab in train_set_full: 63 | im_o = imread_rgb(im) 64 | test_train_set.append((test_trans(im_o), lab, im)) 65 | 66 | for im, lab in test_set_full: 67 | if lab not in labels: 68 | continue 69 | im_o = imread_rgb(im) 70 | test_set.append((test_trans(im_o), lab, im)) 71 | 72 | print('Testing network on dataset with ID {0}'.format(dataset_id)) 73 | net = get_siamese_net() 74 | set_net_train(net, False) 75 | test_embeddings = get_embeddings(net, test_set, device, net.feature_size) 76 | ref_embeddings = get_embeddings(net, test_train_set, device, net.feature_size) 77 | sim = torch.mm(test_embeddings, ref_embeddings.t()) 78 | prec1, c, t, _, _ = precision1(sim, test_set, test_train_set) 79 | mAP = mean_avg_precision(sim, test_set, test_train_set) 80 | print('Descriptor (TEST): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP)) 81 | if dba == 0: 82 | return 83 | print('Testing using instance feature augmentation') 84 | dba_embeddings, dba_set = instance_avg(device, ref_embeddings, 85 | test_train_set, labels, dba) 86 | sim = torch.mm(test_embeddings, dba_embeddings.t()) 87 | prec1, c, t, _, _ = precision1(sim, test_set, dba_set) 88 | mAP = mean_avg_precision(sim, test_set, dba_set) 89 | print('Descriptor (TEST DBA k={4}): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP, dba)) 90 | 91 | 92 | if __name__ == '__main__': 93 | options_l = (['help', 'dataset=', 'model=', 'weights=', 'device=', 94 | 'feature-dim=', 'batch=', 'dba=']) 95 | try: 96 | opts, args = getopt.getopt(sys.argv[1:], '', options_l) 97 | except getopt.GetoptError: 98 | usage() 99 | sys.exit(2) 100 | dataset_full, model, weights, device = None, None, None, None 101 | feature_dim, batch_size, dba = None, None, -1 102 | for opt, arg in opts: 103 | if opt in ('--help'): 104 | usage() 105 | sys.exit() 106 | elif opt in ('--dataset'): 107 | dataset_full = check_folder(arg, 'dataset', True, usage) 108 | elif opt in ('--model'): 109 | model = check_model(arg, usage) 110 | elif opt in ('--weights'): 111 | weights = check_file(arg, 'initialization 
weights', True, usage) 112 | elif opt in ('--device'): 113 | device = check_int(arg, 'device', usage) 114 | elif opt in ('--feature-dim'): 115 | feature_dim = check_int(arg, 'feature-dim', usage) 116 | elif opt in ('--batch'): 117 | batch_size = check_int(arg, 'batch', usage) 118 | elif opt in ('--dba'): 119 | dba = check_int(arg, 'dba', usage) 120 | if (dataset_full is None or model is None or 121 | weights is None or device is None or 122 | feature_dim is None or batch_size is None): 123 | print('One or more required arguments is missing.') 124 | usage() 125 | sys.exit(2) 126 | 127 | with torch.cuda.device(device): 128 | try: 129 | main(dataset_full, model, weights, device, feature_dim, 130 | batch_size, dba) 131 | except: 132 | log_detail(P, None, traceback.format_exc()) 133 | raise 134 | -------------------------------------------------------------------------------- /test/siamese_regions_test.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import sys 5 | import getopt 6 | import torchvision.transforms as transforms 7 | from model.nn_utils import set_net_train 8 | from utils import * 9 | from train.siamese_regions import P, labels 10 | from train.siamese_regions import get_embeddings, get_siamese_net 11 | from instance_avg import instance_avg 12 | 13 | 14 | def usage(): 15 | print('Usage: ' + sys.argv[0] + ' [options]') 16 | prefix = 'Options:\n\tRequired:\n' 17 | o1 = ('--dataset=\t\tThe path to the dataset containing all ' + 18 | 'reference images. It should contain a sub-folder "test" ' + 19 | 'containing all test images\n') 20 | o2 = ('--model=\t\tEither AlexNet or ResNet152 to specify the ' + 21 | 'type of model.\n') 22 | o3 = ('--weights=\t\tThe filename containing weights of a ' + 23 | 'network trained for sub-region classification.\n') 24 | o4 = ('--device=\t\tThe GPU device used for testing. ' + 25 | 'If negative, CPU is used.\n') 26 | o5 = ('--feature-dim=\t\tThe feature dimensionality of the network.\n') 27 | o6 = ('--regions-k=\t\tThe number of highest-scoring regions used to build the descriptor.\n') 28 | o7 = ('--dba=\t\tUse DBA with given k. If k = 0, do not use DBA. ' + 29 | 'If k<0, use all neighbors within the same instance.\n') 30 | o8 = '--help\t\tShow this help\n' 31 | print(prefix + o1 + o2 + o3 + o4 + o5 + o6 + o7 + o8) 32 | 33 | 34 | def main(dataset_full, model, weights, device, feature_dim, regions_k, dba): 35 | # training and test sets 36 | dataset_id = parse_dataset_id(dataset_full) 37 | match_labels = match_label_functions[dataset_id] 38 | train_set_full = get_images_labels(dataset_full, match_labels) 39 | test_set_full = get_images_labels(dataset_full + '/test', match_labels) 40 | 41 | labels_list = [t[1] for t in train_set_full] 42 | # setup global params so that testing functions work properly 43 | labels.extend(sorted(list(set(labels_list)))) 44 | P.num_classes = len(labels) 45 | P.test_pre_proc = True # we always pre process images 46 | P.cuda_device = device 47 | P.preload_net = weights 48 | P.cnn_model = model 49 | P.feature_size2d = feature_sizes[model, image_sizes[dataset_id]] 50 | P.classif_model = '' # only useful for training 51 | P.feature_dim = feature_dim 52 | P.regions_k = regions_k 53 | 54 | print('Loading and transforming train/test sets.') 55 | 56 | # open the images (and transform already if possible) 57 | # do that only if it fits in memory !
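# What get_embeddings computes in this script: the region network
# (RegionDescriptorNet in model/siamese.py) classifies every spatial position
# of the convolutional feature map, keeps the regions_k positions with the
# highest maximal class score, reduces each corresponding feature region with
# its normalize/shift/linear block, sums these regional descriptors and
# L2-normalizes the sum. In eval mode, forward() returns only this global
# descriptor, which is what the similarity and precision/mAP computations
# below consume.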
58 | m, s = read_mean_std(mean_std_files[dataset_id]) 59 | test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 60 | test_set, test_train_set = [], [] 61 | for im, lab in train_set_full: 62 | im_o = imread_rgb(im) 63 | test_train_set.append((test_trans(im_o), lab, im)) 64 | 65 | for im, lab in test_set_full: 66 | if lab not in labels: 67 | continue 68 | im_o = imread_rgb(im) 69 | test_set.append((test_trans(im_o), lab, im)) 70 | 71 | print('Testing network on dataset with ID {0}'.format(dataset_id)) 72 | net = get_siamese_net() 73 | set_net_train(net, False) 74 | test_embeddings = get_embeddings(net, test_set, device, net.feature_size) 75 | ref_embeddings = get_embeddings(net, test_train_set, device, net.feature_size) 76 | sim = torch.mm(test_embeddings, ref_embeddings.t()) 77 | prec1, c, t, _, _ = precision1(sim, test_set, test_train_set) 78 | mAP = mean_avg_precision(sim, test_set, test_train_set) 79 | print('Descriptor (TEST): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP)) 80 | if dba == 0: 81 | return 82 | print('Testing using instance feature augmentation') 83 | dba_embeddings, dba_set = instance_avg(device, ref_embeddings, 84 | test_train_set, labels, dba) 85 | sim = torch.mm(test_embeddings, dba_embeddings.t()) 86 | prec1, c, t, _, _ = precision1(sim, test_set, dba_set) 87 | mAP = mean_avg_precision(sim, test_set, dba_set) 88 | print('Descriptor (TEST DBA k={4}): {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}'.format(c, t, prec1, mAP, dba)) 89 | 90 | 91 | if __name__ == '__main__': 92 | options_l = (['help', 'dataset=', 'model=', 'weights=', 'device=', 93 | 'feature-dim=', 'regions-k=', 'dba=']) 94 | try: 95 | opts, args = getopt.getopt(sys.argv[1:], '', options_l) 96 | except getopt.GetoptError: 97 | usage() 98 | sys.exit(2) 99 | dataset_full, model, weights, device = None, None, None, None 100 | feature_dim, regions_k, dba = None, None, -1 101 | for opt, arg in opts: 102 | if opt in ('--help'): 103 | usage() 104 | sys.exit() 105 | elif opt in ('--dataset'): 106 | dataset_full = check_folder(arg, 'dataset', True, usage) 107 | elif opt in ('--model'): 108 | model = check_model(arg, usage) 109 | elif opt in ('--weights'): 110 | weights = check_file(arg, 'initialization weights', True, usage) 111 | elif opt in ('--device'): 112 | device = check_int(arg, 'device', usage) 113 | elif opt in ('--feature-dim'): 114 | feature_dim = check_int(arg, 'feature-dim', usage) 115 | elif opt in ('--regions-k'): 116 | regions_k = check_int(arg, 'regions-k', usage) 117 | elif opt in ('--dba'): 118 | dba = check_int(arg, 'dba', usage) 119 | if (dataset_full is None or model is None or 120 | weights is None or device is None or 121 | feature_dim is None or regions_k is None): 122 | print('One or more required arguments is missing.') 123 | usage() 124 | sys.exit(2) 125 | 126 | with torch.cuda.device(device): 127 | try: 128 | main(dataset_full, model, weights, device, feature_dim, 129 | regions_k, dba) 130 | except: 131 | log_detail(P, None, traceback.format_exc()) 132 | raise 133 | -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxgreat/Instance-Search/2cea5f64a2d397047072a91788af81c0ea1c6d5e/train/__init__.py -------------------------------------------------------------------------------- /train/classif_finetune.py: -------------------------------------------------------------------------------- 
1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import random 5 | import torch 6 | import torch.optim as optim 7 | import torch.nn as nn 8 | import torchvision.models as models 9 | import torchvision.transforms as transforms 10 | from torch.autograd import Variable 11 | from classif_finetune_p import P 12 | from utils import move_device, tensor_t, tensor, fold_batches, train_gen 13 | from utils import imread_rgb, log, log_detail, test_print_classif 14 | from utils import test_print_descriptor, get_images_labels 15 | from model.siamese import TuneClassif 16 | from model.custom_modules import NormalizeL2Fun 17 | 18 | # keep labels as global variable. they are initialized after 19 | # train set has been loaded and then kept constant 20 | labels = [] 21 | train_type = P.cnn_model.lower() + ' Classification simple fine-tuning' 22 | 23 | 24 | # test a classifier model. it should be in eval mode 25 | def test_classif_net(net, test_set): 26 | """ 27 | Test the network accuracy on a test_set 28 | Return the number of success and the number of evaluations done 29 | """ 30 | trans = P.test_trans 31 | if P.test_pre_proc: 32 | trans = transforms.Compose([]) 33 | 34 | def eval_batch_test(last, i, is_final, batch): 35 | correct, total = last 36 | n = len(batch) 37 | test_in = tensor(P.cuda_device, n, *P.image_input_size) 38 | for j, (testIm, _, _) in enumerate(batch): 39 | test_in[j] = trans(testIm) 40 | out = net(Variable(test_in, volatile=True)).data 41 | # first get all maximal values for classification 42 | # then, use the spatial region with the highest maximal value 43 | # to make a prediction 44 | _, predicted = torch.max(out, 1) 45 | total += n 46 | correct += sum(labels.index(testLabel) == predicted[j][0] for j, (_, testLabel, _) in enumerate(batch)) 47 | return correct, total 48 | 49 | # batch size has to be 1 here 50 | return fold_batches(eval_batch_test, (0, 0), test_set, P.test_batch_size) 51 | 52 | 53 | def train_classif(net, train_set, testset_tuple, criterion, optimizer, best_score=0): 54 | # trans is a list of transforms for each scale here 55 | trans = P.train_trans 56 | if P.train_pre_proc: 57 | trans = transforms.Compose([]) 58 | 59 | # images are already pre-processed in all cases 60 | def create_epoch(epoch, train_set, testset_tuple): 61 | random.shuffle(train_set) 62 | # labels are needed for stats 63 | return train_set, {} 64 | 65 | def create_batch(batch, n): 66 | train_in = tensor(P.cuda_device, n, *P.image_input_size) 67 | labels_in = tensor_t(torch.LongTensor, P.cuda_device, n) 68 | for j, (im, lab, _) in enumerate(batch): 69 | train_in[j] = trans(im) 70 | labels_in[j] = labels.index(lab) 71 | return [train_in], [labels_in] 72 | 73 | def create_loss(t_out, labels_list): 74 | return criterion(t_out, labels_list[0]), None 75 | 76 | train_gen(train_type, P, test_print_classif, test_classif_net, net, 77 | train_set, testset_tuple, optimizer, create_epoch, create_batch, 78 | create_loss, best_score=best_score) 79 | 80 | 81 | # get the embeddings as the normalized output of the classification 82 | def get_embeddings(net, dataset, device, out_size): 83 | trans = P.test_trans 84 | if P.test_pre_proc: 85 | trans = transforms.Compose([]) 86 | 87 | if not P.embeddings_classify: 88 | # remove classifier and add back later 89 | classifier = net.classifier 90 | net.classifier = nn.Sequential() 91 | 92 | def batch(last, i, is_final, batch): 93 | embeddings = last 94 | n = len(batch) 95 | test_in = tensor(P.cuda_device, n, *P.image_input_size) 96 | for j, (testIm, _, _) in 
enumerate(batch): 97 | test_in[j] = trans(testIm) 98 | out = net(Variable(test_in, volatile=True)) 99 | # we have the classification values. just normalize 100 | out = NormalizeL2Fun()(out) 101 | out = out.data 102 | for j in range(n): 103 | embeddings[i + j] = out[j] 104 | return embeddings 105 | 106 | init = tensor(device, len(dataset), out_size) 107 | embeddings = fold_batches(batch, init, dataset, P.test_batch_size) 108 | if not P.embeddings_classify: 109 | net.classifier = classifier 110 | return embeddings 111 | 112 | 113 | def get_class_net(): 114 | model = models.alexnet 115 | if P.cnn_model.lower() == 'resnet152': 116 | model = models.resnet152 117 | net = TuneClassif(model(pretrained=True), len(labels), untrained=P.untrained_blocks) 118 | if P.preload_net: 119 | net.load_state_dict(torch.load(P.preload_net, map_location=lambda storage, location: storage.cpu())) 120 | net = move_device(net, P.cuda_device) 121 | return net 122 | 123 | 124 | def main(): 125 | # training and test sets 126 | train_set_full = get_images_labels(P.dataset_full, P.match_labels) 127 | test_set_full = get_images_labels(P.dataset_full + '/test', P.match_labels) 128 | 129 | labels_list = [t[1] for t in train_set_full] 130 | # we have to give a number to each label, 131 | # so we need a list here for the index 132 | labels.extend(sorted(list(set(labels_list)))) 133 | 134 | log(P, 'Loading and transforming train/test sets.') 135 | 136 | # open the images (and transform already if possible) 137 | # do that only if it fits in memory ! 138 | train_set, test_train_set, test_set = [], [], [] 139 | train_pre_f = P.train_trans if P.train_pre_proc else transforms.Compose([]) 140 | test_pre_f = P.test_trans if P.test_pre_proc else transforms.Compose([]) 141 | for im, lab in train_set_full: 142 | im_o = imread_rgb(im) 143 | train_set.append((train_pre_f(im_o), lab, im)) 144 | test_train_set.append((test_pre_f(im_o), lab, im)) 145 | 146 | for im, lab in test_set_full: 147 | if lab not in labels: 148 | continue 149 | im_o = imread_rgb(im) 150 | test_set.append((test_pre_f(im_o), lab, im)) 151 | 152 | class_net = get_class_net() 153 | optimizer = optim.SGD((p for p in class_net.parameters() if p.requires_grad), lr=P.train_lr, momentum=P.train_momentum, weight_decay=P.train_weight_decay) 154 | criterion = nn.CrossEntropyLoss(size_average=P.train_loss_avg) 155 | testset_tuple = (test_set, test_train_set) 156 | if P.test_upfront: 157 | log(P, 'Upfront testing of classification model') 158 | score = test_print_classif(train_type, P, class_net, testset_tuple, test_classif_net) 159 | else: 160 | score = 0 161 | if P.train: 162 | log(P, 'Starting classification training') 163 | train_classif(class_net, train_set, testset_tuple, criterion, optimizer, best_score=score) 164 | log(P, 'Finished classification training') 165 | if P.test_descriptor_net: 166 | log(P, 'Testing as descriptor') 167 | test_print_descriptor(train_type, P, class_net, testset_tuple, get_embeddings) 168 | 169 | 170 | if __name__ == '__main__': 171 | with torch.cuda.device(P.cuda_device): 172 | try: 173 | main() 174 | except: 175 | log_detail(P, None, traceback.format_exc()) 176 | raise 177 | -------------------------------------------------------------------------------- /train/classif_finetune_p.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from utils import * # image transforms, general utilities 5 | from global_p import * # global config 6 | 7 | # in 
AlexNet, there are 5 convolutional layers with parameters 8 | # and 3 FC layers in the classifier 9 | # in ResNet, before first layer, there are 2 modules with parameters. 10 | # then number of blocks per layers: 11 | # ResNet152 - layer 1: 3, layer 2: 8, layer 3: 36, layer 4: 3 12 | # ResNet50 - layer 1: 3, layer 2: 4, layer 3: 6, layer 4: 3 13 | # finally, a single FC layer is used as classifier 14 | untrained_blocks = { 15 | 'alexnet': 4, 16 | 'resnet152': 2 + 3 + 8 + 36 17 | } 18 | 19 | 20 | # parameters for the sub-regions classification training with AlexNet 21 | class Params(object): 22 | 23 | def __init__(self): 24 | # general parameters 25 | self.cnn_model = 'AlexNet' 26 | self.dataset_full = 'data/pre_proc/CLICIDE_video_224sq' 27 | self.cuda_device = 0 28 | self.dataset_id = parse_dataset_id(self.dataset_full) 29 | # the file containing mean and standard deviation values 30 | # for a new dataset, simply use the filename here or add it to the 31 | # global_p module parameters 32 | # (this is valid for the following parameters, too) 33 | self.mean_std_file = mean_std_files[self.dataset_id] 34 | # the function for obtaining labels from a filename in the dataset 35 | # this function takes a filename and returns a unique label 36 | self.match_labels = match_label_functions[self.dataset_id] 37 | # input size. this is usually always (3, 224, 224) unless larger 38 | # fixed-size images should be used 39 | self.image_input_size = image_sizes[self.dataset_id] 40 | # the number of different labels in the dataset 41 | self.num_classes = num_classes[self.dataset_id] 42 | # the 2D size of the convolutional features of the base network 43 | self.feature_size2d = feature_sizes[(self.cnn_model.lower(), self.image_input_size)] 44 | # the number of blocks in the base network that should not be trained 45 | # (starting from the lowest and going to higher layers/blocks) 46 | # usually, block represents a layer with parameters, 47 | # for ResNet or equivalent, block is a whole block of layers 48 | self.untrained_blocks = untrained_blocks[self.cnn_model.lower()] 49 | 50 | # read mean and standard of dataset here to define transforms already 51 | m, s = read_mean_std(self.mean_std_file) 52 | 53 | # Classification net general and test params 54 | self.preload_net = '' # allows to continue training a network 55 | self.test_upfront = True 56 | self.train = True 57 | self.test_batch_size = 64 58 | self.test_pre_proc = True 59 | self.test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 60 | 61 | # Classification net training params 62 | self.train_epochs = 50 63 | self.train_batch_size = 32 64 | self.train_micro_batch = 0 65 | self.train_aug_rot = r = 180 66 | self.train_aug_hrange = hr = 0.25 67 | self.train_aug_vrange = vr = 0.25 68 | self.train_aug_hsrange = hsr = 0.25 69 | self.train_aug_vsrange = vsr = 0.25 70 | self.train_aug_hflip = hflip = True 71 | trans = transforms.Compose([random_affine_noisy_cv(rotation=r, h_range=hr, v_range=vr, hs_range=hsr, vs_range=vsr, h_flip=hflip), transforms.ToTensor(), transforms.Normalize(m, s)]) 72 | # transformation for each scale 73 | self.train_trans = trans 74 | self.train_pre_proc = False 75 | 76 | self.train_lr = 1e-2 77 | self.train_momentum = 0.9 78 | self.train_weight_decay = 5e-4 79 | self.train_optim = 'SGD' 80 | self.train_annealing = {30: 0.1} 81 | self.train_loss_avg = True 82 | self.train_loss_int = 10 83 | self.train_test_int = 0 84 | # the batch norm layer cannot be trained if the micro-batch size 85 | # is too small, 
as global variances/means cannot be properly 86 | # approximated in this case. so train only when having a batch 87 | # of at least 16 88 | self.train_bn = self.train_micro_batch >= 16 or (self.train_micro_batch <= 0 and (self.train_batch_size >= 16 or self.train_batch_size <= 0)) 89 | 90 | # Descriptor net parameters 91 | # if True, test the network as a descriptor 92 | # (using the normalized classification output): 93 | self.test_descriptor_net = True 94 | # the threshold (in Bytes) for embeddings to be computed on GPU 95 | self.embeddings_cuda_size = 2 ** 30 96 | # if True, use classifier output for embeddings. 97 | # else use convolutional features 98 | self.embeddings_classify = False 99 | self.feature_dim = self.num_classes if self.embeddings_classify else flat_feature_sizes[(self.cnn_model.lower(), self.image_input_size)] 100 | # UUID for these parameters (current time) 101 | self.uuid = datetime.now() 102 | self.save_dir = 'data' 103 | self.log_file = path.join(self.save_dir, unique_str(self) + '.log') 104 | 105 | 106 | # global test params: 107 | P = Params() 108 | -------------------------------------------------------------------------------- /train/classif_regions.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import random 5 | import torch 6 | import torch.optim as optim 7 | import torch.nn as nn 8 | import torchvision.models as models 9 | import torchvision.transforms as transforms 10 | from torch.autograd import Variable 11 | from classif_regions_p import P 12 | from utils import move_device, tensor_t, tensor, fold_batches, train_gen 13 | from utils import imread_rgb, log, log_detail, test_print_classif 14 | from utils import test_print_descriptor, get_images_labels 15 | from model.siamese import TuneClassif, TuneClassifSub 16 | from model.custom_modules import NormalizeL2Fun 17 | 18 | # keep labels as global variable. they are initialized after 19 | # train set has been loaded and then kept constant 20 | labels = [] 21 | train_type = P.cnn_model.lower() + ' Classification sub-regions' 22 | 23 | 24 | # test a classifier model. 
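# Illustrative sketch (not part of the repository): the read-out performed by
# test_classif_net just below. The sub-regions network returns one score per
# class and per spatial position; the prediction is the class at the position
# whose best score is highest. Written for a recent PyTorch with illustrative
# names; the original code does this with successive torch.max calls.
import torch

def predict_from_spatial_scores(score_map):
    # score_map: tensor of shape (num_classes, H, W)
    best_per_pos, best_class_per_pos = score_map.max(dim=0)     # both (H, W)
    flat_pos = best_per_pos.view(-1).argmax()                   # most activated position
    return best_class_per_pos.view(-1)[flat_pos].item()         # predicted class index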
it should be in eval mode 25 | def test_classif_net(net, test_set): 26 | """ 27 | Test the network accuracy on a test_set 28 | Return the number of success and the number of evaluations done 29 | """ 30 | trans = P.test_trans 31 | if P.test_pre_proc: 32 | trans = transforms.Compose([]) 33 | 34 | def eval_batch_test(last, i, is_final, batch): 35 | correct, total = last 36 | im_trans = trans(batch[0][0]) 37 | test_in = move_device(im_trans.unsqueeze(0), P.cuda_device) 38 | out = net(Variable(test_in, volatile=True))[0].data 39 | # first get all maximal values for classification 40 | # then, use the spatial region with the highest maximal value 41 | # to make a prediction 42 | max_pred, predicted = torch.max(out, 1) 43 | _, max_subp = torch.max(max_pred.view(-1), 0) 44 | predicted = predicted.view(-1)[max_subp[0]] 45 | total += 1 46 | correct += (labels.index(batch[0][1]) == predicted) 47 | 48 | return correct, total 49 | 50 | # batch size has to be 1 here 51 | return fold_batches(eval_batch_test, (0, 0), test_set, 1) 52 | 53 | 54 | def train_classif_subparts(net, train_set, testset_tuple, criterion, optimizer, best_score=0): 55 | # trans is a list of transforms for each scale here 56 | trans_scales = P.train_trans 57 | for i, t in enumerate(trans_scales): 58 | if P.train_pre_proc[i]: 59 | trans_scales[i] = transforms.Compose([]) 60 | 61 | # images are already pre-processed in all cases 62 | def create_epoch(epoch, train_set, testset_tuple): 63 | random.shuffle(train_set) 64 | # labels are needed for stats 65 | return train_set, {} 66 | 67 | def create_batch(batch, n): 68 | # must proceed image by image (since different input sizes) 69 | # each image/batch is composed of multiple scales 70 | n_sc = len(batch[0][0]) 71 | train_in_scales = [] 72 | labels_in = tensor_t(torch.LongTensor, P.cuda_device, 1) 73 | labels_in.fill_(labels.index(batch[0][1])) 74 | for j in range(n_sc): 75 | im = trans_scales[j](batch[0][0][j]) 76 | train_in = move_device(im.unsqueeze(0), P.cuda_device) 77 | train_in_scales.append(train_in) 78 | return train_in_scales, [labels_in] 79 | 80 | def create_loss(scales_out, labels_list): 81 | # scales_out is a list over all scales, 82 | # with all sub-region classifications for each scale 83 | labels_in = labels_list[0] 84 | loss = None 85 | for s, t_out in enumerate(scales_out): 86 | # batch size is 1, only consider this output 87 | t_out0 = t_out[0] 88 | # all spatial outputs are of shape (num_classes, width, height) 89 | # make a 'batch' as follows: (width * height, num_classes) 90 | # then apply loss to the whole batch, and accumulate over scales 91 | t_out_all = t_out0.view(t_out0.size(0), -1).t() 92 | if loss is None: 93 | loss = criterion(t_out_all, labels_in.expand(t_out_all.size(0))) 94 | else: 95 | loss += criterion(t_out_all, labels_in.expand(t_out_all.size(0))) 96 | if P.train_loss_avg: 97 | loss /= len(scales_out) 98 | return loss, None 99 | 100 | train_gen(train_type, P, test_print_classif, test_classif_net, net, 101 | train_set, testset_tuple, optimizer, create_epoch, create_batch, 102 | create_loss, best_score=best_score) 103 | 104 | 105 | # get the embeddings as the normalized output of the classification 106 | # values where the highest maximal activation occurred 107 | def get_embeddings(net, dataset, device, out_size): 108 | test_trans = P.test_trans 109 | if P.test_pre_proc: 110 | test_trans = transforms.Compose([]) 111 | 112 | def batch(last, i, is_final, batch): 113 | embeddings = last 114 | im_trans = test_trans(batch[0][0]) 115 | test_in = 
move_device(im_trans.unsqueeze(0), P.cuda_device) 116 | out = net(Variable(test_in, volatile=True))[0].data 117 | # first, determine location of highest maximal activation 118 | max_pred, _ = out.max(1) 119 | max_pred1, max_i1 = max_pred.max(2) 120 | _, max_i2 = max_pred1.max(3) 121 | i2 = max_i2.view(-1)[0] 122 | i1 = max_i1.view(-1)[i2] 123 | # we have the indexes of the highest maximal activation, 124 | # get the classification values at this point and normalize 125 | out = out[:, :, i1, i2] 126 | out = NormalizeL2Fun()(Variable(out, volatile=True)) 127 | out = out.data 128 | embeddings[i] = out[0] 129 | return embeddings 130 | 131 | init = tensor(device, len(dataset), out_size) 132 | return fold_batches(batch, init, dataset, 1) 133 | 134 | 135 | def get_class_net(): 136 | model = models.alexnet 137 | if P.cnn_model.lower() == 'resnet152': 138 | model = models.resnet152 139 | if P.bn_model: 140 | bn_model = TuneClassif(model(), len(labels)) 141 | bn_model.load_state_dict(torch.load(P.bn_model, map_location=lambda storage, location: storage.cpu())) 142 | # copy_bn_all(net.features, bn_model.features) 143 | else: 144 | bn_model = model(pretrained=True) 145 | net = TuneClassifSub(bn_model, len(labels), P.feature_size2d, untrained=P.untrained_blocks) 146 | if P.preload_net: 147 | net.load_state_dict(torch.load(P.preload_net, map_location=lambda storage, location: storage.cpu())) 148 | net = move_device(net, P.cuda_device) 149 | return net 150 | 151 | 152 | def main(): 153 | # training and test sets 154 | train_set_full = get_images_labels(P.dataset_full, P.match_labels) 155 | test_set_full = get_images_labels(P.dataset_full + '/test', P.match_labels) 156 | 157 | labels_list = [t[1] for t in train_set_full] 158 | # we have to give a number to each label, 159 | # so we need a list here for the index 160 | labels.extend(sorted(list(set(labels_list)))) 161 | 162 | log(P, 'Loading and transforming train/test sets.') 163 | 164 | # open the images (and transform already if possible) 165 | # do that only if it fits in memory ! 
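# Illustrative sketch (not part of the repository): the form of the descriptors
# produced by get_embeddings above. The classification values taken at the most
# activated position are passed through NormalizeL2Fun (model/custom_modules.py),
# i.e. scaled to unit L2 norm, so that the torch.mm of two embedding matrices in
# the test scripts yields cosine similarities. A plain restatement for a recent
# PyTorch, with illustrative names:
def l2_normalize_rows(x, eps=1e-12):
    # x: (N, d) tensor of raw descriptor values
    norms = x.norm(p=2, dim=1, keepdim=True)
    return x / norms.clamp(min=eps)     # each row becomes a unit-length descriptor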
166 | train_set, test_train_set, test_set = [], [], [] 167 | train_pre_f = [t if pre_proc else transforms.Compose([]) for t, pre_proc in zip(P.train_trans, P.train_pre_proc)] 168 | test_pre_f = P.test_trans if P.test_pre_proc else transforms.Compose([]) 169 | train_scales = P.train_sub_scales 170 | for im, lab in train_set_full: 171 | im_o = imread_rgb(im) 172 | scales = [t(im_o) for t in train_scales] 173 | train_set.append((scales, lab, im)) 174 | for j, t in enumerate(train_pre_f): 175 | scales[j] = t(scales[j]) 176 | im_pre_test = test_pre_f(im_o) if test_pre_f else im_o 177 | test_train_set.append((im_pre_test, lab, im)) 178 | 179 | for im, lab in test_set_full: 180 | if lab not in labels: 181 | continue 182 | im_o = imread_rgb(im) 183 | test_set.append((test_pre_f(im_o), lab, im)) 184 | 185 | class_net = get_class_net() 186 | optimizer = optim.SGD((p for p in class_net.parameters() if p.requires_grad), lr=P.train_lr, momentum=P.train_momentum, weight_decay=P.train_weight_decay) 187 | criterion = nn.CrossEntropyLoss(size_average=P.train_loss_avg) 188 | testset_tuple = (test_set, test_train_set) 189 | if P.test_upfront: 190 | log(P, 'Upfront testing of classification model') 191 | score = test_print_classif(train_type, P, class_net, testset_tuple, test_classif_net) 192 | else: 193 | score = 0 194 | if P.train: 195 | log(P, 'Starting classification training') 196 | train_classif_subparts(class_net, train_set, testset_tuple, criterion, optimizer, best_score=score) 197 | log(P, 'Finished classification training') 198 | if P.test_descriptor_net: 199 | log(P, 'Testing as descriptor') 200 | test_print_descriptor(train_type, P, class_net, testset_tuple, get_embeddings) 201 | 202 | 203 | if __name__ == '__main__': 204 | with torch.cuda.device(P.cuda_device): 205 | try: 206 | main() 207 | except: 208 | log_detail(P, None, traceback.format_exc()) 209 | raise 210 | -------------------------------------------------------------------------------- /train/classif_regions_p.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from utils import * # image transforms, general utilities 5 | from global_p import * # global config 6 | 7 | # in AlexNet, there are 5 convolutional layers with parameters 8 | # and 3 FC layers in the classifier 9 | # in ResNet, before first layer, there are 2 modules with parameters. 
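# Illustrative sketch (not part of the repository): what the untrained_blocks
# counts defined below mean operationally. The model wrappers in model/siamese.py
# receive this count through their untrained= argument; leaving the first n
# parameterised blocks untrained amounts to clearing requires_grad on their
# parameters, which the optimizers above respect by filtering on p.requires_grad.
# Names below are illustrative, assuming blocks ordered from input to output.
def freeze_first_blocks(blocks, n):
    for block in blocks[:n]:
        for p in block.parameters():
            p.requires_grad = False    # excluded from gradient updates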
10 | # then number of blocks per layers: 11 | # ResNet152 - layer 1: 3, layer 2: 8, layer 3: 36, layer 4: 3 12 | # ResNet50 - layer 1: 3, layer 2: 4, layer 3: 6, layer 4: 3 13 | # finally, a single FC layer is used as classifier 14 | untrained_blocks = { 15 | 'alexnet': 4, 16 | 'resnet152': 2 + 3 + 8 + 36 17 | } 18 | 19 | 20 | # parameters for the sub-regions classification training with AlexNet 21 | class Params(object): 22 | 23 | def __init__(self): 24 | # general parameters 25 | self.cnn_model = 'ResNet152' 26 | self.dataset_full = 'data/pre_proc/fourviere_clean2_448' 27 | self.cuda_device = 0 28 | self.dataset_id = parse_dataset_id(self.dataset_full) 29 | # the file containing mean and standard deviation values 30 | # for a new dataset, simply use the filename here or add it to the 31 | # global_p module parameters 32 | # (this is valid for the following parameters, too) 33 | self.mean_std_file = mean_std_files[self.dataset_id] 34 | # the function for obtaining labels from a filename in the dataset 35 | # this function takes a filename and returns a unique label 36 | self.match_labels = match_label_functions[self.dataset_id] 37 | # input size. this is usually always (3, 224, 224) unless larger 38 | # fixed-size images should be used 39 | self.image_input_size = image_sizes[self.dataset_id] 40 | # the number of different labels in the dataset 41 | self.num_classes = num_classes[self.dataset_id] 42 | # the 2D size of the convolutional features of the base network 43 | self.feature_size2d = feature_sizes[(self.cnn_model.lower(), self.image_input_size)] 44 | # the number of blocks in the base network that should not be trained 45 | # (starting from the lowest and going to higher layers/blocks) 46 | # usually, block represents a layer with parameters, 47 | # for ResNet or equivalent, block is a whole block of layers 48 | self.untrained_blocks = untrained_blocks[self.cnn_model.lower()] 49 | 50 | # read mean and standard of dataset here to define transforms already 51 | m, s = read_mean_std(self.mean_std_file) 52 | 53 | # Classification net general and test params 54 | self.preload_net = '' # allows to continue training a network 55 | self.bn_model = 'data/final_classif_ft/fou_resnet152.pth.tar' 56 | self.test_upfront = True 57 | self.train = True 58 | self.test_pre_proc = True 59 | self.test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 60 | 61 | # Classification net training params 62 | self.train_epochs = 50 63 | self.train_batch_size = 32 64 | self.train_micro_batch = 1 # has to be 1 65 | self.train_aug_rot = r = 45 66 | self.train_aug_hrange = hr = 0 67 | self.train_aug_vrange = vr = 0 68 | self.train_aug_hsrange = hsr = 0.25 69 | self.train_aug_vsrange = vsr = 0.25 70 | self.train_aug_hflip = hflip = True 71 | trans = transforms.Compose([random_affine_noisy_cv(rotation=r, h_range=hr, v_range=vr, hs_range=hsr, vs_range=vsr, h_flip=hflip), transforms.ToTensor(), transforms.Normalize(m, s)]) 72 | # list of transforms for all scales 73 | # the train_trans parameter should be a list of same 74 | # length representing the train transformation for each scale 75 | self.train_sub_scales = [transforms.Compose([]), scale_cv(224)] 76 | # transformation for each scale 77 | self.train_trans = [trans, trans] 78 | self.train_pre_proc = [False, False] 79 | 80 | self.train_lr = 1e-3 81 | self.train_momentum = 0.9 82 | self.train_weight_decay = 5e-4 83 | self.train_optim = 'SGD' 84 | self.train_annealing = {30: 0.1} 85 | self.train_loss_avg = True 86 | self.train_loss_int 
= 10 87 | self.train_test_int = 0 88 | # the batch norm layer cannot be trained if the micro-batch size 89 | # is too small, as global variances/means cannot be properly 90 | # approximated in this case. so train only when having a batch 91 | # of at least 16 92 | self.train_bn = False 93 | 94 | # Descriptor net parameters 95 | # if True, test the network as a descriptor 96 | # (using the normalized classification output): 97 | self.test_descriptor_net = True 98 | # the threshold (in Bytes) for embeddings to be computed on GPU 99 | self.embeddings_cuda_size = 2 ** 30 100 | self.feature_dim = self.num_classes 101 | 102 | # UUID for these parameters (current time) 103 | self.uuid = datetime.now() 104 | self.save_dir = 'data' 105 | self.log_file = path.join(self.save_dir, unique_str(self) + '.log') 106 | 107 | 108 | # global test params: 109 | P = Params() 110 | -------------------------------------------------------------------------------- /train/global_p.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | # from utils import trans_str, fun_str 4 | 5 | 6 | def match_label_fou_clean2(x): 7 | s = x.split('/')[-1].split('_') 8 | return s[0] + s[1] 9 | 10 | 11 | def match_label_video(x): 12 | return x.split('/')[-1].split('-')[0] 13 | 14 | 15 | def match_label_oxford(x): 16 | return x.split('/')[-1].split('_')[0] 17 | 18 | 19 | image_sizes = { 20 | 'CLICIDE': (3, 224, 224), 21 | 'CLICIDE_max_224sq': (3, 224, 224), 22 | 'CLICIDE_video_227sq': (3, 227, 227), 23 | 'CLICIDE_video_224sq': (3, 224, 224), 24 | 'CLICIDE_video_384': (3, 224, 224), 25 | 'CLICIDE_video_448': (3, 224, 224), 26 | 'fourviere_clean2_224sq': (3, 224, 224), 27 | 'fourviere_clean2_384': (3, 224, 224), 28 | 'fourviere_clean2_448': (3, 224, 224), 29 | 'oxford5k_video_224sq': (3, 224, 224), 30 | 'oxford5k_video_384': (3, 224, 224) 31 | } 32 | 33 | num_classes = { 34 | 'CLICIDE': 464, 35 | 'CLICIDE_max_224sq': 464, 36 | 'CLICIDE_video_227sq': 464, 37 | 'CLICIDE_video_224sq': 464, 38 | 'CLICIDE_video_384': 464, 39 | 'CLICIDE_video_448': 464, 40 | 'fourviere_clean2_224sq': 311, 41 | 'fourviere_clean2_384': 311, 42 | 'fourviere_clean2_448': 311, 43 | 'oxford5k_video_224sq': 17, 44 | 'oxford5k_video_384': 17 45 | } 46 | 47 | feature_sizes = { 48 | ('alexnet', (3, 224, 224)): (6, 6), 49 | ('resnet152', (3, 224, 224)): (7, 7), 50 | ('resnet152', (3, 227, 227)): (8, 8) 51 | } 52 | 53 | flat_feature_sizes = { 54 | ('alexnet', (3, 224, 224)): 9216, 55 | ('resnet152', (3, 224, 224)): 2048 56 | } 57 | 58 | mean_std_files = { 59 | 'CLICIDE': 'data/CLICIDE_224sq_train_ms.txt', 60 | 'CLICIDE_video_227sq': 'data/cli.txt', 61 | 'CLICIDE_video_224sq': 'data/CLICIDE_224sq_train_ms.txt', 62 | 'CLICIDE_max_224sq': 'data/CLICIDE_224sq_train_ms.txt', 63 | 'CLICIDE_video_384': 'data/CLICIDE_384_train_ms.txt', 64 | 'CLICIDE_video_448': 'data/CLICIDE_448_train_ms.txt', 65 | 'fourviere_clean2_224sq': 'data/fourviere_224sq_train_ms.txt', 66 | 'fourviere_clean2_384': 'data/fourviere_384_train_ms.txt', 67 | 'fourviere_clean2_448': 'data/fourviere_448_train_ms.txt', 68 | 'oxford5k_video_224sq': 'data/oxford5k_224sq_train_ms.txt', 69 | 'oxford5k_video_384': 'data/oxford5k_384_train_ms.txt', 70 | } 71 | 72 | match_label_functions = { 73 | 'CLICIDE': match_label_video, 74 | 'CLICIDE_video_227sq': match_label_video, 75 | 'CLICIDE_max_224sq': match_label_video, 76 | 'CLICIDE_video_224sq': match_label_video, 77 | 'CLICIDE_video_384': match_label_video, 78 | 'CLICIDE_video_448': match_label_video, 
79 | 'fourviere_clean2_224sq': match_label_fou_clean2, 80 | 'fourviere_clean2_384': match_label_fou_clean2, 81 | 'fourviere_clean2_448': match_label_fou_clean2, 82 | 'oxford5k_video_224sq': match_label_oxford, 83 | 'oxford5k_video_384': match_label_oxford 84 | } 85 | -------------------------------------------------------------------------------- /train/siamese_descriptor.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import random 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | import torchvision.models as models 9 | import torchvision.transforms as transforms 10 | from torch.autograd import Variable 11 | from siamese_descriptor_p import P 12 | from utils import move_device, tensor_t, tensor, fold_batches, train_gen 13 | from utils import imread_rgb, log, log_detail, get_lab_indicators 14 | from utils import get_images_labels, get_similarities, embeddings_device_dim 15 | from utils import test_print_descriptor, choose_rand_neg, get_pos_couples 16 | from model.siamese import TuneClassif, DescriptorNet 17 | from model.custom_modules import TripletLoss 18 | 19 | # keep labels as global variable. they are initialized after 20 | # train set has been loaded and then kept constant 21 | labels = [] 22 | train_type = P.cnn_model.lower() + ' Siamese descriptor' 23 | 24 | 25 | def get_embeddings(net, dataset, device, out_size): 26 | test_trans = P.test_trans 27 | if P.test_pre_proc: 28 | test_trans = transforms.Compose([]) 29 | 30 | def batch(last, i, is_final, batch): 31 | embeddings = last 32 | test_in = tensor(P.cuda_device, len(batch), *P.image_input_size) 33 | for j, (im, _, _) in enumerate(batch): 34 | test_in[j] = test_trans(im) 35 | out = net(Variable(test_in, volatile=True)).data 36 | for j, embedding in enumerate(out): 37 | embeddings[i + j] = embedding 38 | return embeddings 39 | 40 | init = tensor(device, len(dataset), out_size) 41 | return fold_batches(batch, init, dataset, P.test_batch_size) 42 | 43 | 44 | # train using triplets, constructing triplets from all positive couples 45 | def train_siam_triplets_pos_couples(net, train_set, testset_tuple, criterion, optimizer, best_score=0): 46 | """ 47 | TODO 48 | """ 49 | train_trans = P.train_trans 50 | if P.train_pre_proc: 51 | train_trans = transforms.Compose([]) 52 | 53 | couples = get_pos_couples(train_set) 54 | sim_device, _ = embeddings_device_dim(P, net, len(train_set), sim_matrix=True) 55 | lab_indicators = get_lab_indicators(train_set, sim_device) 56 | num_pos = sum(len(couples[lab]) for lab in couples) 57 | log(P, '#pos (without order, with duplicates):{0}'.format(num_pos)) 58 | 59 | # fold over positive couples here and choose negative for each pos 60 | # need to make sure the couples are evenly distributed 61 | # such that all batches can have couples from every instance 62 | def shuffle_couples(couples): 63 | for lab in couples: 64 | random.shuffle(couples[lab]) 65 | # get x such that only 20% of labels have more than x couples 66 | a = np.array([len(couples[lab]) for lab in couples]) 67 | x = int(np.percentile(a, 80)) 68 | out = [] 69 | keys = couples.keys() 70 | random.shuffle(keys) 71 | # append the elements to out in a strided way 72 | # (up to x elements per label) 73 | for count in range(x): 74 | for lab in keys: 75 | if count >= len(couples[lab]): 76 | continue 77 | out.append(couples[lab][count]) 78 | # the last elements in the longer lists are inserted at random 79 | for lab in keys: 80 | for i 
in range(x, len(couples[lab])): 81 | out.insert(random.randrange(len(out)), couples[lab][i]) 82 | return out 83 | 84 | def create_epoch(epoch, couples, testset_tuple): 85 | test_ref_set = testset_tuple[1] 86 | # use the test-train set to obtain embeddings and similarities 87 | # (since it may be transformed differently than train set) 88 | similarities, _ = get_similarities(P, get_embeddings, net, test_ref_set) 89 | 90 | # shuffle the couples 91 | shuffled = shuffle_couples(couples) 92 | return shuffled, {'epoch': epoch, 'similarities': similarities} 93 | 94 | def create_batch(batch, n, epoch, similarities): 95 | # one image at a time. batch is always of size 1 96 | train_in1 = tensor(P.cuda_device, n, *P.image_input_size) 97 | train_in2 = tensor(P.cuda_device, n, *P.image_input_size) 98 | train_in3 = tensor(P.cuda_device, n, *P.image_input_size) 99 | labels_in = tensor_t(torch.LongTensor, P.cuda_device, n) 100 | # we get positive couples. find negatives for them 101 | for j, (lab, (i1, i2), (im1, im2)) in enumerate(batch): 102 | im3 = None 103 | # choose a semi-hard negative. see FaceNet 104 | # paper by Schroff et al for details. 105 | # essentially, choose hardest negative that is still 106 | # easier than the positive. this should avoid 107 | # collapsing the model at beginning of training 108 | ind_exl = lab_indicators[lab] 109 | sim_pos = similarities[i1, i2] 110 | if epoch < P.train_epoch_switch: 111 | # exclude all positives as well as any that are 112 | # more similar than sim_pos 113 | ind_exl = ind_exl | similarities[i1].ge(sim_pos) 114 | if ind_exl.sum() >= similarities.size(0): 115 | p = 'cant find semi-hard neg for' 116 | s = 'falling back to random neg' 117 | n_pos = lab_indicators[lab].sum() 118 | n_ge = similarities[i1].ge(sim_pos).sum() 119 | n_tot = similarities.size(0) 120 | print('{0} {1}-{2}-{3} (#pos:{4}, #ge:{5}, #total:{6}), {7}'.format(p, i1, i2, lab, n_pos, n_ge, n_tot, s)) 121 | else: 122 | # similarities must be in [-1, 1] 123 | # set all similarities of excluded indexes to -2 124 | # then take argmax (highest similarity not excluded) 125 | sims = similarities[i1].clone() 126 | sims[ind_exl] = -2 127 | _, k = sims.max(0) 128 | im3 = train_set[k[0]][0] 129 | if im3 is None: 130 | # default to random negative 131 | im3 = choose_rand_neg(train_set, lab) 132 | # one image at a time 133 | train_in1[j] = train_trans(im1) 134 | train_in2[j] = train_trans(im2) 135 | train_in3[j] = train_trans(im3) 136 | labels_in[j] = labels.index(lab) 137 | # return input tensors and labels 138 | return [train_in1, train_in2, train_in3], [labels_in] 139 | 140 | def create_loss(out, labels_list): 141 | # out is a tuple of 3 descriptors 142 | # simply apply triplet loss 143 | loss = criterion(*out) 144 | return loss, None 145 | 146 | train_gen(train_type, P, test_print_descriptor, get_embeddings, net, 147 | couples, testset_tuple, optimizer, create_epoch, create_batch, 148 | create_loss, best_score=best_score) 149 | 150 | 151 | def get_siamese_net(): 152 | model = models.alexnet 153 | if P.cnn_model.lower() == 'resnet152': 154 | model = models.resnet152 155 | class_net = TuneClassif(model(pretrained=True), P.num_classes, untrained=P.untrained_blocks) 156 | if P.classif_model: 157 | class_net.load_state_dict(torch.load(P.classif_model, map_location=lambda storage, location: storage.cpu())) 158 | net = DescriptorNet(class_net, P.feature_dim, P.feature_size2d, untrained=P.untrained_blocks) 159 | if P.preload_net: 160 | net.load_state_dict(torch.load(P.preload_net, map_location=lambda 
storage, location: storage.cpu())) 161 | net = move_device(net, P.cuda_device) 162 | return net 163 | 164 | 165 | def main(): 166 | # training and test sets 167 | train_set_full = get_images_labels(P.dataset_full, P.match_labels) 168 | test_set_full = get_images_labels(P.dataset_full + '/test', P.match_labels) 169 | 170 | labels_list = [t[1] for t in train_set_full] 171 | # we have to give a number to each label, 172 | # so we need a list here for the index 173 | labels.extend(sorted(list(set(labels_list)))) 174 | 175 | log(P, 'Loading and transforming train/test sets.') 176 | 177 | train_set, test_train_set, test_set = [], [], [] 178 | train_pre_f = P.train_trans if P.train_pre_proc else transforms.Compose([]) 179 | test_pre_f = P.test_trans if P.test_pre_proc else transforms.Compose([]) 180 | for im, lab in train_set_full: 181 | im_o = imread_rgb(im) 182 | train_set.append((train_pre_f(im_o), lab, im)) 183 | test_train_set.append((test_pre_f(im_o), lab, im)) 184 | 185 | for im, lab in test_set_full: 186 | if lab not in labels: 187 | continue 188 | im_o = imread_rgb(im) 189 | test_set.append((test_pre_f(im_o), lab, im)) 190 | 191 | siam_net = get_siamese_net() 192 | optimizer = optim.SGD((p for p in siam_net.parameters() if p.requires_grad), lr=P.train_lr, momentum=P.train_momentum, weight_decay=P.train_weight_decay) 193 | criterion = TripletLoss(P.triplet_margin, P.train_loss_avg) 194 | testset_tuple = (test_set, test_train_set) 195 | if P.test_upfront: 196 | log(P, 'Upfront testing of descriptor model') 197 | score = test_print_descriptor(train_type, P, siam_net, testset_tuple, get_embeddings) 198 | else: 199 | score = 0 200 | if P.train: 201 | log(P, 'Starting region-descriptor training') 202 | train_siam_triplets_pos_couples(siam_net, train_set, testset_tuple, criterion, optimizer, best_score=score) 203 | log(P, 'Finished region-descriptor training') 204 | if P.test_descriptor_net: 205 | log(P, 'Testing as descriptor') 206 | # set best score high enough such that it will never be saved 207 | test_print_descriptor(train_type, P, siam_net, testset_tuple, get_embeddings, best_score=len(test_set) + 1) 208 | 209 | 210 | if __name__ == '__main__': 211 | with torch.cuda.device(P.cuda_device): 212 | try: 213 | main() 214 | except: 215 | log_detail(P, None, traceback.format_exc()) 216 | raise 217 | -------------------------------------------------------------------------------- /train/siamese_descriptor_p.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from utils import * # image transforms, general utilities 5 | from global_p import * # global config 6 | 7 | # in AlexNet, there are 5 convolutional layers with parameters 8 | # and 3 FC layers in the classifier 9 | # in ResNet, before first layer, there are 2 modules with parameters. 
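# Illustrative sketch (not part of the repository): the triplet objective behind
# the TripletLoss criterion used by the script above and configured below via
# triplet_margin. Whether the repository version compares Euclidean distances or
# cosine similarities is not shown here; the cosine form on L2-normalized
# descriptors is used purely as an example (recent PyTorch, illustrative names).
import torch

def triplet_margin_loss(anchor, positive, negative, margin=0.1):
    # all inputs: (N, d) tensors of unit-length descriptors
    sim_pos = (anchor * positive).sum(dim=1)    # anchor-positive cosine similarity
    sim_neg = (anchor * negative).sum(dim=1)    # anchor-negative cosine similarity
    # hinge: the positive must beat the negative by at least the margin
    return torch.clamp(margin - sim_pos + sim_neg, min=0).mean()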
10 | # then number of blocks per layers: 11 | # ResNet152 - layer 1: 3, layer 2: 8, layer 3: 36, layer 4: 3 12 | # ResNet50 - layer 1: 3, layer 2: 4, layer 3: 6, layer 4: 3 13 | # finally, a single FC layer is used as classifier 14 | untrained_blocks = { 15 | 'alexnet': 4, 16 | 'resnet152': 2 + 3 + 8 + 36 17 | } 18 | 19 | 20 | # parameters for the sub-regions classification training with AlexNet 21 | class Params(object): 22 | 23 | def __init__(self): 24 | # general parameters 25 | self.cnn_model = 'ResNet152' 26 | self.dataset_full = 'data/pre_proc/CLICIDE_video_224sq' 27 | self.cuda_device = 0 28 | self.dataset_id = parse_dataset_id(self.dataset_full) 29 | # the file containing mean and standard deviation values 30 | # for a new dataset, simply use the filename here or add it to the 31 | # global_p module parameters 32 | # (this is valid for the following parameters, too) 33 | self.mean_std_file = mean_std_files[self.dataset_id] 34 | # the function for obtaining labels from a filename in the dataset 35 | # this function takes a filename and returns a unique label 36 | self.match_labels = match_label_functions[self.dataset_id] 37 | # input size. this is usually always (3, 224, 224) unless larger 38 | # fixed-size images should be used 39 | self.image_input_size = image_sizes[self.dataset_id] 40 | # the number of different labels in the dataset 41 | self.num_classes = num_classes[self.dataset_id] 42 | # the 2D size of the convolutional features of the base network 43 | self.feature_size2d = feature_sizes[(self.cnn_model.lower(), self.image_input_size)] 44 | # the number of blocks in the base network that should not be trained 45 | # (starting from the lowest and going to higher layers/blocks) 46 | # usually, block represents a layer with parameters, 47 | # for ResNet or equivalent, block is a whole block of layers 48 | self.untrained_blocks = untrained_blocks[self.cnn_model.lower()] 49 | 50 | # read mean and standard of dataset here to define transforms already 51 | m, s = read_mean_std(self.mean_std_file) 52 | 53 | # Classification net general and test params 54 | self.preload_net = '' # allows to continue training a network 55 | self.classif_model = 'data/final_classif_ft/cli_resnet152.pth.tar' 56 | self.test_upfront = True 57 | self.train = True 58 | self.test_batch_size = 32 59 | self.test_pre_proc = True 60 | self.test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 61 | 62 | # Classification net training params 63 | self.train_epochs = 20 64 | self.train_batch_size = 64 65 | self.train_micro_batch = 8 66 | self.train_aug_rot = r = 45 67 | self.train_aug_hrange = hr = 0 68 | self.train_aug_vrange = vr = 0 69 | self.train_aug_hsrange = hsr = 0.25 70 | self.train_aug_vsrange = vsr = 0.25 71 | self.train_aug_hflip = hflip = True 72 | trans = transforms.Compose([random_affine_noisy_cv(rotation=r, h_range=hr, v_range=vr, hs_range=hsr, vs_range=vsr, h_flip=hflip), transforms.ToTensor(), transforms.Normalize(m, s)]) 73 | 74 | # transformation for each scale 75 | self.train_trans = trans 76 | self.train_pre_proc = False 77 | 78 | self.train_lr = 1e-3 79 | self.train_momentum = 0.9 80 | self.train_weight_decay = 5e-4 81 | self.train_optim = 'SGD' 82 | self.train_annealing = {} 83 | self.train_loss_avg = False 84 | self.train_loss_int = 10 85 | self.train_test_int = 0 86 | # the batch norm layer cannot be trained if the micro-batch size 87 | # is too small, as global variances/means cannot be properly 88 | # approximated in this case. 
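# Illustrative sketch (not part of the repository): what disabling batch-norm
# training (the train_bn flag below) amounts to in practice, assuming a recent
# PyTorch. With micro-batches of only a few images the batch statistics are
# meaningless, so the layers keep their stored running statistics and their
# affine parameters are not updated. Names are illustrative.
import torch.nn as nn

def freeze_batch_norm(net):
    for m in net.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.eval()                        # keep using the running mean/variance
            for p in m.parameters():
                p.requires_grad = False     # do not update the affine weight/bias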
so train only when having a batch 89 | # of at least 16 90 | self.train_bn = self.train_micro_batch >= 16 or (self.train_micro_batch <= 0 and (self.train_batch_size >= 16 or self.train_batch_size <= 0)) 91 | 92 | # Descriptor net parameters 93 | # if True, test the network as a descriptor 94 | # (using the normalized classification output): 95 | self.test_descriptor_net = True 96 | # the threshold (in Bytes) for embeddings to be computed on GPU 97 | self.embeddings_cuda_size = 2 ** 30 98 | self.feature_dim = 2048 99 | self.triplet_margin = 0.1 100 | # number of epochs after which semi-hard triplet choice switches 101 | # to hard triplet choice 102 | self.train_epoch_switch = 2 103 | 104 | # UUID for these parameters (current time) 105 | self.uuid = datetime.now() 106 | self.save_dir = 'data' 107 | self.log_file = path.join(self.save_dir, unique_str(self) + '.log') 108 | 109 | 110 | # global test params: 111 | P = Params() 112 | -------------------------------------------------------------------------------- /train/siamese_regions.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import traceback 4 | import random 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | import torch.nn as nn 9 | import torchvision.models as models 10 | import torchvision.transforms as transforms 11 | from torch.autograd import Variable 12 | from siamese_regions_p import P 13 | from utils import move_device, tensor_t, tensor, fold_batches, train_gen 14 | from utils import imread_rgb, log, log_detail, get_lab_indicators 15 | from utils import get_images_labels, get_similarities, embeddings_device_dim 16 | from utils import test_print_descriptor, choose_rand_neg, get_pos_couples 17 | from model.siamese import TuneClassifSub, RegionDescriptorNet 18 | from model.custom_modules import TripletLoss 19 | 20 | # keep labels as global variable. 
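# Illustrative sketch (not part of the repository): the semi-hard negative
# choice (after Schroff et al., FaceNet) used by the triplet training in this
# file and in siamese_descriptor.py above. For an anchor, take the most similar
# training image that is neither a positive nor more similar than the chosen
# positive, and signal a fallback to a random negative when no such image
# exists. Written for a recent PyTorch; names are illustrative.
def pick_semi_hard_negative(sim_row, positive_mask, sim_anchor_pos):
    # sim_row: similarities of the anchor to every training image, in [-1, 1]
    # positive_mask: boolean tensor marking images sharing the anchor's label
    excluded = positive_mask | (sim_row >= sim_anchor_pos)
    if excluded.all():
        return None                   # caller falls back to a random negative
    sims = sim_row.clone()
    sims[excluded] = -2.0             # below any valid cosine similarity
    return sims.argmax().item()       # index of the hardest admissible negative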
they are initialized after 21 | # train set has been loaded and then kept constant 22 | labels = [] 23 | train_type = P.cnn_model.lower() + ' Siamese sub-regions' 24 | 25 | 26 | def get_embeddings(net, dataset, device, out_size): 27 | test_trans = P.test_trans 28 | if P.test_pre_proc: 29 | test_trans = transforms.Compose([]) 30 | 31 | def batch(last, i, is_final, batch): 32 | embeddings = last 33 | # one image at a time 34 | test_in = move_device(test_trans(batch[0][0]).unsqueeze(0), P.cuda_device) 35 | 36 | out = net(Variable(test_in, volatile=True)).data 37 | embeddings[i] = out[0] 38 | return embeddings 39 | 40 | init = tensor(device, len(dataset), out_size) 41 | return fold_batches(batch, init, dataset, 1) 42 | 43 | 44 | # train using triplets, constructing triplets from all positive couples 45 | def train_siam_triplets_pos_couples(net, train_set, testset_tuple, criterion, criterion2, optimizer, best_score=0): 46 | """ 47 | TODO 48 | """ 49 | train_trans = P.train_trans 50 | if P.train_pre_proc: 51 | train_trans = transforms.Compose([]) 52 | 53 | couples = get_pos_couples(train_set) 54 | sim_device, _ = embeddings_device_dim(P, net, len(train_set), sim_matrix=True) 55 | lab_indicators = get_lab_indicators(train_set, sim_device) 56 | num_pos = sum(len(couples[lab]) for lab in couples) 57 | log(P, '#pos (without order, with duplicates):{0}'.format(num_pos)) 58 | 59 | # fold over positive couples here and choose negative for each pos 60 | # need to make sure the couples are evenly distributed 61 | # such that all batches can have couples from every instance 62 | def shuffle_couples(couples): 63 | for lab in couples: 64 | random.shuffle(couples[lab]) 65 | # get x such that only 20% of labels have more than x couples 66 | a = np.array([len(couples[lab]) for lab in couples]) 67 | x = int(np.percentile(a, 80)) 68 | out = [] 69 | keys = couples.keys() 70 | random.shuffle(keys) 71 | # append the elements to out in a strided way 72 | # (up to x elements per label) 73 | for count in range(x): 74 | for lab in keys: 75 | if count >= len(couples[lab]): 76 | continue 77 | out.append(couples[lab][count]) 78 | # the last elements in the longer lists are inserted at random 79 | for lab in keys: 80 | for i in range(x, len(couples[lab])): 81 | out.insert(random.randrange(len(out)), couples[lab][i]) 82 | return out 83 | 84 | def create_epoch(epoch, couples, testset_tuple): 85 | test_ref_set = testset_tuple[1] 86 | # use the test-train set to obtain embeddings and similarities 87 | # (since it may be transformed differently than train set) 88 | similarities, _ = get_similarities(P, get_embeddings, net, test_ref_set) 89 | 90 | # shuffle the couples 91 | shuffled = shuffle_couples(couples) 92 | return shuffled, {'epoch': epoch, 'similarities': similarities} 93 | 94 | def create_batch(batch, n, epoch, similarities): 95 | # one image at a time. batch is always of size 1 96 | lab, (i1, i2), (im1, im2) = batch[0] 97 | labels_in = tensor_t(torch.LongTensor, P.cuda_device, 1) 98 | labels_in[0] = labels.index(lab) 99 | # we get a positive couple. find negative for it 100 | im3 = None 101 | # choose a semi-hard negative. see FaceNet 102 | # paper by Schroff et al for details. 103 | # essentially, choose hardest negative that is still 104 | # easier than the positive. 
this should avoid 105 | # collapsing the model at beginning of training 106 | ind_exl = lab_indicators[lab] 107 | sim_pos = similarities[i1, i2] 108 | if epoch < P.train_epoch_switch: 109 | # exclude all positives as well as any that are 110 | # more similar than sim_pos 111 | ind_exl = ind_exl | similarities[i1].ge(sim_pos) 112 | if ind_exl.sum() >= similarities.size(0): 113 | p = 'cant find semi-hard neg for' 114 | s = 'falling back to random neg' 115 | n_pos = lab_indicators[lab].sum() 116 | n_ge = similarities[i1].ge(sim_pos).sum() 117 | n_tot = similarities.size(0) 118 | print('{0} {1}-{2}-{3} (#pos:{4}, #ge:{5}, #total:{6}), {7}'.format(p, i1, i2, lab, n_pos, n_ge, n_tot, s)) 119 | else: 120 | # similarities must be in [-1, 1] 121 | # set all similarities of excluded indexes to -2 122 | # then take argmax (highest similarity not excluded) 123 | sims = similarities[i1].clone() 124 | sims[ind_exl] = -2 125 | _, k = sims.max(0) 126 | im3 = train_set[k[0]][0] 127 | if im3 is None: 128 | # default to random negative 129 | im3 = choose_rand_neg(train_set, lab) 130 | # one image at a time 131 | train_in1 = move_device(train_trans(im1).unsqueeze(0), P.cuda_device) 132 | train_in2 = move_device(train_trans(im2).unsqueeze(0), P.cuda_device) 133 | train_in3 = move_device(train_trans(im3).unsqueeze(0), P.cuda_device) 134 | # return input tensors and labels 135 | return [train_in1, train_in2, train_in3], [labels_in] 136 | 137 | def create_loss(out, labels_list): 138 | # out is a tuple of 3 tuples, each for the descriptor 139 | # and a tensor with all classification results for the highest 140 | # classification values. the first loss is a simple loss on the 141 | # descriptors. the second loss is a classification loss for 142 | # each sub-region of the anchor input (first input). 143 | # we simply sum-aggregate here 144 | loss = criterion(*(t for t, _ in out)) 145 | cls_out = out[0][1] # classification values for anchor 146 | # there is only 1 batch of k classification values, so cls_out 147 | # has dimension (1, num_classes, k). 
need to get (k, num_classes) 148 | cls_out_all = cls_out.squeeze(0).t() 149 | loss2 = criterion2(cls_out_all, labels_list[0].expand(cls_out_all.size(0))) 150 | return loss, loss2 151 | 152 | train_gen(train_type, P, test_print_descriptor, get_embeddings, net, 153 | couples, testset_tuple, optimizer, create_epoch, create_batch, 154 | create_loss, best_score=best_score) 155 | 156 | 157 | def get_siamese_net(): 158 | model = models.alexnet 159 | if P.cnn_model.lower() == 'resnet152': 160 | model = models.resnet152 161 | class_net = TuneClassifSub(model(pretrained=True), P.num_classes, P.feature_size2d, untrained=P.untrained_blocks) 162 | if P.classif_model: 163 | class_net.load_state_dict(torch.load(P.classif_model, map_location=lambda storage, location: storage.cpu())) 164 | net = RegionDescriptorNet(class_net, P.regions_k, P.feature_dim, P.feature_size2d, untrained=P.untrained_blocks) 165 | if P.preload_net: 166 | net.load_state_dict(torch.load(P.preload_net, map_location=lambda storage, location: storage.cpu())) 167 | net = move_device(net, P.cuda_device) 168 | return net 169 | 170 | 171 | def main(): 172 | # training and test sets 173 | train_set_full = get_images_labels(P.dataset_full, P.match_labels) 174 | test_set_full = get_images_labels(P.dataset_full + '/test', P.match_labels) 175 | 176 | labels_list = [t[1] for t in train_set_full] 177 | # we have to give a number to each label, 178 | # so we need a list here for the index 179 | labels.extend(sorted(list(set(labels_list)))) 180 | 181 | log(P, 'Loading and transforming train/test sets.') 182 | 183 | train_set, test_train_set, test_set = [], [], [] 184 | train_pre_f = P.train_trans if P.train_pre_proc else transforms.Compose([]) 185 | test_pre_f = P.test_trans if P.test_pre_proc else transforms.Compose([]) 186 | for im, lab in train_set_full: 187 | im_o = imread_rgb(im) 188 | train_set.append((train_pre_f(im_o), lab, im)) 189 | test_train_set.append((test_pre_f(im_o), lab, im)) 190 | 191 | for im, lab in test_set_full: 192 | if lab not in labels: 193 | continue 194 | im_o = imread_rgb(im) 195 | test_set.append((test_pre_f(im_o), lab, im)) 196 | 197 | siam_net = get_siamese_net() 198 | optimizer = optim.SGD((p for p in siam_net.parameters() if p.requires_grad), lr=P.train_lr, momentum=P.train_momentum, weight_decay=P.train_weight_decay) 199 | criterion = TripletLoss(P.triplet_margin, P.train_loss_avg) 200 | criterion2 = nn.CrossEntropyLoss(size_average=P.train_loss2_avg) 201 | testset_tuple = (test_set, test_train_set) 202 | if P.test_upfront: 203 | log(P, 'Upfront testing of descriptor model') 204 | score = test_print_descriptor(train_type, P, siam_net, testset_tuple, get_embeddings) 205 | else: 206 | score = 0 207 | if P.train: 208 | log(P, 'Starting region-descriptor training') 209 | train_siam_triplets_pos_couples(siam_net, train_set, testset_tuple, criterion, criterion2, optimizer, best_score=score) 210 | log(P, 'Finished region-descriptor training') 211 | if P.test_descriptor_net: 212 | log(P, 'Testing as descriptor') 213 | # set best score high enough such that it will never be saved 214 | test_print_descriptor(train_type, P, siam_net, testset_tuple, get_embeddings, best_score=len(test_set) + 1) 215 | 216 | 217 | if __name__ == '__main__': 218 | with torch.cuda.device(P.cuda_device): 219 | try: 220 | main() 221 | except: 222 | log_detail(P, None, traceback.format_exc()) 223 | raise 224 | -------------------------------------------------------------------------------- /train/siamese_regions_p.py: 
-------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from datetime import datetime 4 | from utils import * # image transforms, general utilities 5 | from global_p import * # global config 6 | 7 | # in AlexNet, there are 5 convolutional layers with parameters 8 | # and 3 FC layers in the classifier 9 | # in ResNet, before first layer, there are 2 modules with parameters. 10 | # then number of blocks per layers: 11 | # ResNet152 - layer 1: 3, layer 2: 8, layer 3: 36, layer 4: 3 12 | # ResNet50 - layer 1: 3, layer 2: 4, layer 3: 6, layer 4: 3 13 | # finally, a single FC layer is used as classifier 14 | untrained_blocks = { 15 | 'alexnet': 4, 16 | 'resnet152': 2 + 3 + 8 + 36 17 | } 18 | 19 | 20 | # parameters for the sub-regions classification training with AlexNet 21 | class Params(object): 22 | 23 | def __init__(self): 24 | # general parameters 25 | self.cnn_model = 'ResNet152' 26 | self.dataset_full = 'data/pre_proc/CLICIDE_video_448' 27 | self.cuda_device = 0 28 | self.dataset_id = parse_dataset_id(self.dataset_full) 29 | # the file containing mean and standard deviation values 30 | # for a new dataset, simply use the filename here or add it to the 31 | # global_p module parameters 32 | # (this is valid for the following parameters, too) 33 | self.mean_std_file = mean_std_files[self.dataset_id] 34 | # the function for obtaining labels from a filename in the dataset 35 | # this function takes a filename and returns a unique label 36 | self.match_labels = match_label_functions[self.dataset_id] 37 | # input size. this is usually always (3, 224, 224) unless larger 38 | # fixed-size images should be used 39 | self.image_input_size = image_sizes[self.dataset_id] 40 | # the number of different labels in the dataset 41 | self.num_classes = num_classes[self.dataset_id] 42 | # the 2D size of the convolutional features of the base network 43 | self.feature_size2d = feature_sizes[(self.cnn_model.lower(), self.image_input_size)] 44 | # the number of blocks in the base network that should not be trained 45 | # (starting from the lowest and going to higher layers/blocks) 46 | # usually, block represents a layer with parameters, 47 | # for ResNet or equivalent, block is a whole block of layers 48 | self.untrained_blocks = untrained_blocks[self.cnn_model.lower()] 49 | 50 | # read mean and standard of dataset here to define transforms already 51 | m, s = read_mean_std(self.mean_std_file) 52 | 53 | # Classification net general and test params 54 | self.preload_net = '' # allows to continue training a network 55 | self.classif_model = 'data/final_classif_sub/cli_resnet152.pth.tar' 56 | self.test_upfront = True 57 | self.train = True 58 | self.test_pre_proc = True 59 | self.test_trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize(m, s)]) 60 | 61 | # Classification net training params 62 | self.train_epochs = 20 63 | self.train_batch_size = 64 64 | self.train_micro_batch = 1 # has to be 1 65 | self.train_aug_rot = r = 45 66 | self.train_aug_hrange = hr = 0 67 | self.train_aug_vrange = vr = 0 68 | self.train_aug_hsrange = hsr = 0.25 69 | self.train_aug_vsrange = vsr = 0.25 70 | self.train_aug_hflip = hflip = True 71 | trans = transforms.Compose([random_affine_noisy_cv(rotation=r, h_range=hr, v_range=vr, hs_range=hsr, vs_range=vsr, h_flip=hflip), transforms.ToTensor(), transforms.Normalize(m, s)]) 72 | 73 | # transformation for each scale 74 | self.train_trans = trans 75 | self.train_pre_proc = False 76 | 77 | self.train_lr = 
1e-4 78 | self.train_momentum = 0.9 79 | self.train_weight_decay = 0. 80 | self.train_optim = 'SGD' 81 | self.train_annealing = {} 82 | self.train_loss_avg = False 83 | self.train_loss_int = 10 84 | self.train_test_int = 0 85 | # the batch norm layer cannot be trained if the micro-batch size 86 | # is too small, as global variances/means cannot be properly 87 | # approximated in this case. so train only when having a batch 88 | # of at least 16 89 | self.train_bn = self.train_micro_batch >= 16 or (self.train_micro_batch <= 0 and (self.train_batch_size >= 16 or self.train_batch_size <= 0)) 90 | 91 | # Descriptor net parameters 92 | # if True, test the network as a descriptor 93 | # (using the normalized classification output): 94 | self.test_descriptor_net = True 95 | # the threshold (in Bytes) for embeddings to be computed on GPU 96 | self.embeddings_cuda_size = 2 ** 30 97 | self.feature_dim = 2048 98 | self.regions_k = 6 99 | self.train_loss2_alpha = 1.0 100 | self.train_loss2_avg = True 101 | self.triplet_margin = 0.1 102 | # number of epochs after which semi-hard triplet choice switches 103 | # to hard triplet choice 104 | self.train_epoch_switch = 2 105 | 106 | # UUID for these parameters (current time) 107 | self.uuid = datetime.now() 108 | self.save_dir = 'data' 109 | self.log_file = path.join(self.save_dir, unique_str(self) + '.log') 110 | 111 | 112 | # global test params: 113 | P = Params() 114 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from dataset import * 2 | from general import * 3 | from image import * 4 | from metrics import * 5 | from train_classif import * 6 | from train_general import * 7 | from train_siamese import * 8 | -------------------------------------------------------------------------------- /utils/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import itertools 4 | import glob 5 | import random 6 | import torch 7 | from os import path 8 | from general import tensor_t 9 | 10 | 11 | def get_images_labels(folder='.', label_f=lambda x: x.split('.')[0]): 12 | """ 13 | Read a folder containing images where the name of the class is in the filename 14 | the label function should return the label given the filename 15 | Return : 16 | list of couple : (image filename, label) 17 | """ 18 | exts = ('*.jpg', '*.JPG', '*.JPEG', "*.png") 19 | r = [] 20 | for ext in exts: 21 | r.extend([(im, label_f(im)) for im in glob.iglob(path.join(folder, ext))]) 22 | return r 23 | 24 | 25 | # get couples of images as a dict with images as keys and all 26 | # images of same label as values 27 | def get_pos_couples_ibi(dataset, duplicate=True): 28 | couples = {} 29 | for (_, l1, name1), (im2, l2, name2) in itertools.product(dataset, dataset): 30 | if l1 != l2 or (name1 is name2 and not duplicate): 31 | continue 32 | if name1 in couples: 33 | couples[name1].append(im2) 34 | else: 35 | couples[name1] = [im2] 36 | return couples 37 | 38 | 39 | # get the positive couples of a dataset as a dict with labels as keys 40 | def get_pos_couples(dataset, duplicate=True): 41 | couples = {} 42 | comb = itertools.combinations_with_replacement 43 | if not duplicate: 44 | comb = itertools.combinations 45 | for (i1, (x1, l1, _)), (i2, (x2, l2, _)) in comb(enumerate(dataset), 2): 46 | if l1 != l2: 47 | continue 48 | t = (l1, (i1, i2), (x1, x2)) 49 | if l1 in couples: 50 | 
couples[l1].append(t) 51 | else: 52 | couples[l1] = [t] 53 | return couples 54 | 55 | 56 | # return a random negative for the given label and train set 57 | def choose_rand_neg(train_set, lab): 58 | im_neg, lab_neg, _ = random.choice(train_set) 59 | while (lab_neg == lab): 60 | im_neg, lab_neg, _ = random.choice(train_set) 61 | return im_neg 62 | 63 | 64 | # get byte tensors indicating the indexes of images having a different label 65 | def get_lab_indicators(dataset, device): 66 | n = len(dataset) 67 | indicators = {} 68 | for _, lab1, _ in dataset: 69 | if lab1 in indicators: 70 | continue 71 | indicator = tensor_t(torch.ByteTensor, device, n).fill_(0) 72 | for i2, (_, lab2, _) in enumerate(dataset): 73 | if lab1 == lab2: 74 | indicator[i2] = 1 75 | indicators[lab1] = indicator 76 | return indicators 77 | -------------------------------------------------------------------------------- /utils/general.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | import sys 5 | import tempfile 6 | import inspect 7 | import types 8 | import torch 9 | import os 10 | 11 | 12 | # to check an option specifying a file that should or should not exist 13 | def check_file(arg, name, should_exist, usage): 14 | if should_exist and not os.path.isfile(arg): 15 | print('Cannot find {0} file at path {1}\n'.format(name, arg)) 16 | usage() 17 | sys.exit(2) 18 | if not should_exist and os.path.isfile(arg): 19 | print('Cannot overwrite {0} file at path {1}\n'.format(name, arg)) 20 | usage() 21 | sys.exit(2) 22 | return arg 23 | 24 | 25 | # to check an option specifying a folder that should or should not exist 26 | def check_folder(arg, name, should_exist, usage): 27 | if should_exist and not os.path.isdir(arg): 28 | print('Cannot find {0} folder at path {1}\n'.format(name, arg)) 29 | usage() 30 | sys.exit(2) 31 | if not should_exist and os.path.isdir(arg): 32 | print('Cannot overwrite {0} folder at path {1}\n'.format(name, arg)) 33 | usage() 34 | sys.exit(2) 35 | return arg 36 | 37 | 38 | # to check an option specifying the model 39 | def check_model(arg, usage): 40 | if arg.lower() == 'alexnet' or arg.lower() == 'resnet152': 41 | return arg.lower() 42 | print('Model {0} is not a valid model'.format(arg)) 43 | usage() 44 | sys.exit(2) 45 | 46 | 47 | # to check an option specifying an integer 48 | def check_int(arg, name, usage): 49 | try: 50 | return int(arg) 51 | except ValueError: 52 | print('{0} was given as {1}. This is not an integer.\n' 53 | .format(name, arg)) 54 | usage() 55 | sys.exit(2) 56 | 57 | 58 | # to check an option specifying a boolean 59 | def check_bool(arg, name, usage): 60 | arg = arg.lower() 61 | if arg == '': 62 | print('{0} was not given. 
It should be a boolean (true/yes/y/1 for True and otherwise False).'.format(name)) 63 | usage() 64 | sys.exit(2) 65 | if arg == 'true' or arg == 'yes' or arg == 'y' or arg == '1': 66 | return True 67 | return False 68 | 69 | 70 | def parse_dataset_id(dataset_full): 71 | if dataset_full.endswith('/'): 72 | dataset_full = dataset_full[:-1] 73 | return dataset_full.split('/')[-1] 74 | 75 | 76 | def read_mean_std(fname): 77 | with open(fname) as f: 78 | mean = map(float, f.readline().split(' ')) 79 | std = map(float, f.readline().split(' ')) 80 | return mean, std 81 | 82 | 83 | def fun_str(f): 84 | if f.__class__ in (types.FunctionType, types.BuiltinFunctionType, types.BuiltinMethodType): 85 | return f.__name__ 86 | else: 87 | return f.__class__.__name__ 88 | 89 | 90 | def trans_str(trans): 91 | return ','.join(fun_str(t) for t in trans.transforms) 92 | 93 | 94 | def move_device(obj, device): 95 | if device >= 0: 96 | return obj.cuda() 97 | else: 98 | return obj.cpu() 99 | 100 | 101 | def tensor_t(t, device, *sizes): 102 | return move_device(t(*sizes), device) 103 | 104 | 105 | def tensor(device, *sizes): 106 | return tensor_t(torch.Tensor, device, *sizes) 107 | 108 | 109 | def unique_str(P): 110 | return P.uuid.strftime('%Y%m%d-%H%M%S-%f') 111 | 112 | 113 | def save(P, f, prefix): 114 | f.write('{0}\n\n'.format(prefix)) 115 | # for name, value in sorted(vars(P).items()): 116 | # if name == 'uuid': 117 | # continue 118 | # if name in ('test_trans', 'train_trans', 'train_sub_scales'): 119 | # if type(value) is list or type(value) is tuple: 120 | # value = ', '.join(trans_str(t) for t in value) 121 | # else: 122 | # value = trans_str(value) 123 | # elif name in ('match_labels_f'): 124 | # value = fun_str(value) 125 | # f.write('{0}:{1}\n'.format(name, value)) 126 | f.write(inspect.getsource(P.__class__)) 127 | f.close() 128 | 129 | 130 | def save_uuid(P, prefix): 131 | f = tempfile.NamedTemporaryFile(dir=P.save_dir, delete=False) 132 | save(P, f, prefix) 133 | # the following will not work on Windows (would need to add a remove first) 134 | os.rename(f.name, os.path.join(P.save_dir, unique_str(P) + '.params')) 135 | 136 | 137 | def log_detail(P, p_file, *args): 138 | if p_file: 139 | print(*args, file=p_file) 140 | if P.log_file: 141 | with open(P.log_file, 'a') as f: 142 | print(*args, file=f) 143 | 144 | 145 | def log(P, *args): 146 | log_detail(P, sys.stdout, *args) 147 | 148 | 149 | def mod_param(p_file, param, new_value): 150 | with open(p_file) as f_in: 151 | with open(p_file + '.tmp', 'w') as f_out: 152 | for line in f_in: 153 | if (line.strip().startswith('self.' 
+ param) and 154 | len(line.split('=')) == 2): 155 | new_line = (line.split('=')[0] + '= \'' + 156 | new_value + '\'\n') 157 | f_out.write(new_line) 158 | else: 159 | f_out.write(line) 160 | # as above, in Windows a remove would be needed first 161 | os.rename(p_file + '.tmp', p_file) 162 | -------------------------------------------------------------------------------- /utils/image.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torchvision.transforms as transforms 4 | from PIL import Image 5 | import cv2 6 | from scipy.ndimage import interpolation 7 | import numpy as np 8 | import random 9 | 10 | 11 | # ---------------------- Image transformations ----------------- 12 | def norm_image_t(tensor): 13 | m, s = [], []  # two separate lists: per-channel means and stds 14 | for t in tensor: 15 | m.append(t.mean()) 16 | s.append(t.std()) 17 | return transforms.Normalize(m, s)(tensor) 18 | 19 | 20 | # pad a PIL image to a square 21 | def pad_square(img): 22 | longer_side = max(img.size) 23 | h_pad = (longer_side - img.size[0]) // 2 24 | h_mod = (longer_side - img.size[0]) % 2 25 | v_pad = (longer_side - img.size[1]) // 2 26 | v_mod = (longer_side - img.size[1]) % 2 27 | return img.crop((-h_pad - h_mod, -v_pad - v_mod, img.size[0] + h_pad, img.size[1] + v_pad)) 28 | 29 | 30 | # randomly rotate, shift and scale vertically and horizontally a PIL image with given angle in degrees and shifting/scaling ratios 31 | # inspired by http://stackoverflow.com/questions/7501009/affine-transform-in-pil-python 32 | def random_affine(rotation=0, h_range=0, v_range=0, hs_range=0, vs_range=0): 33 | rotation = rotation * (np.pi / 180) 34 | 35 | def rand_affine(im): 36 | angle = random.uniform(-rotation, rotation) 37 | x, y = im.size[0] / 2, im.size[1] / 2 38 | nx = x + random.uniform(-h_range, h_range) * im.size[0] 39 | ny = y + random.uniform(-v_range, v_range) * im.size[1] 40 | sx = 1 + random.uniform(-hs_range, hs_range) 41 | sy = 1 + random.uniform(-vs_range, vs_range) 42 | cos, sin = np.cos(angle), np.sin(angle) 43 | a, b = cos / sx, sin / sx 44 | c = x - nx * a - ny * b 45 | d, e = -sin / sy, cos / sy 46 | f = y - nx * d - ny * e 47 | return im.transform(im.size, Image.AFFINE, (a, b, c, d, e, f), resample=Image.BICUBIC) 48 | return rand_affine 49 | 50 | 51 | def pad_square_cv(img): 52 | longer_side = max(img.shape[:2]) 53 | v_pad = (longer_side - img.shape[0]) // 2 54 | v_mod = (longer_side - img.shape[0]) % 2 55 | h_pad = (longer_side - img.shape[1]) // 2 56 | h_mod = (longer_side - img.shape[1]) % 2 57 | return np.pad(img, ((v_pad + v_mod, v_pad), (h_pad + h_mod, h_pad), (0, 0)), 'constant', constant_values=((0, 0), (0, 0), (0, 0))) 58 | 59 | 60 | def scale_cv(new_size, inter=cv2.INTER_CUBIC): 61 | if isinstance(new_size, tuple): 62 | def sc_cv(img): 63 | return cv2.resize(img, new_size, interpolation=inter) 64 | return sc_cv 65 | else: 66 | def sc_cv(img): 67 | h, w, _ = img.shape 68 | if (w <= h and w == new_size) or (h <= w and h == new_size): 69 | return img 70 | if w < h: 71 | ow = new_size 72 | oh = int(round(float(new_size * h) / w)) 73 | return cv2.resize(img, (ow, oh), interpolation=inter) 74 | else: 75 | oh = new_size 76 | ow = int(round(float(new_size * w) / h)) 77 | return cv2.resize(img, (ow, oh), interpolation=inter) 78 | return sc_cv 79 | 80 | 81 | def center_crop_cv(size): 82 | if not isinstance(size, tuple): 83 | size = (int(size), int(size)) 84 | 85 | def cent_crop_cv(img): 86 | h, w, _ = img.shape 87 | th, tw = size 88 | x1 = int(round((w - tw) / 2.)) 89 | y1
= int(round((h - th) / 2.)) 90 | return img[y1:y1 + th, x1:x1 + tw] 91 | return cent_crop_cv 92 | 93 | 94 | def random_crop_cv(size): 95 | if not isinstance(size, tuple): 96 | size = (int(size), int(size)) 97 | 98 | def rand_crop_cv(img): 99 | h, w, _ = img.shape 100 | th, tw = size 101 | if w == tw and h == th: 102 | return img 103 | x1 = random.randint(0, w - tw) 104 | y1 = random.randint(0, h - th) 105 | return img[y1:y1 + th, x1:x1 + tw] 106 | return rand_crop_cv 107 | 108 | 109 | # crop randomly using same aspect ratio as image 110 | # such that shorter side has given size 111 | def random_crop_keep_ar_cv(short_side): 112 | def rand_crop_cv(img): 113 | h, w, _ = img.shape 114 | if (h <= w and h == short_side) or (w <= h and w == short_side): 115 | return img 116 | if h < w: 117 | th = short_side 118 | tw = int(round(float(short_side * w) / h)) 119 | else: 120 | tw = short_side 121 | th = int(round(float(short_side * h) / w)) 122 | x1 = random.randint(0, w - tw) 123 | y1 = random.randint(0, h - th) 124 | return img[y1:y1 + th, x1:x1 + tw] 125 | return rand_crop_cv 126 | 127 | 128 | def affine_cv(img, angle, v_shift, h_shift, sx, sy, cval=0.): 129 | # apply translation first to allow the center to be 130 | # offset to any position when using rotation 131 | mat = np.array([ 132 | [sy * np.cos(angle), -sy * np.sin(angle), v_shift], 133 | [sx * np.sin(angle), sx * np.cos(angle), h_shift], 134 | [0., 0., 1.] 135 | ]) 136 | # make sure the transform is applied at the center of the image, 137 | # then reset it afterwards 138 | offset = (img.shape[0] / 2.0 + 0.5, img.shape[1] / 2.0 + 0.5) 139 | mat = np.dot(np.dot( 140 | np.array([ 141 | [1., 0., offset[0]], 142 | [0., 1., offset[1]], 143 | [0., 0., 1.]]), 144 | mat), 145 | np.array([ 146 | [1., 0., -offset[0]], 147 | [0., 1., -offset[1]], 148 | [0., 0., 1.]])) 149 | 150 | def t(channel): 151 | return interpolation.affine_transform(channel, mat[:2, :2], mat[:2, 2], cval=cval) 152 | # apply transformation to each channel separately 153 | return np.dstack(map(t, (img[:, :, i] for i in range(img.shape[2])))) 154 | 155 | 156 | def random_affine_scale_cv(range_low, range_high): 157 | def rand_aff_scale_cv(img): 158 | scale = random.uniform(range_low, range_high) 159 | return affine_cv(img, 0., 0., 0., scale, scale) 160 | return rand_aff_scale_cv 161 | 162 | 163 | def affine_scale_noisy_cv(scale): 164 | def aff_scale_noisy(img): 165 | img = affine_cv(img.astype(float), 0., 0., 0., scale, scale, cval=.1) 166 | img[img == .1] = np.random.randint(256, size=np.sum(img == .1)) 167 | return img.astype(np.uint8) 168 | return aff_scale_noisy 169 | 170 | 171 | def random_affine_noisy_cv(rotation=0, h_range=0, v_range=0, hs_range=0, vs_range=0, h_flip=False): 172 | rotation = rotation * (np.pi / 180) 173 | 174 | def rand_aff_noisy_cv(img): 175 | # compose the affine transformation applied to x 176 | angle = np.random.uniform(-rotation, rotation) 177 | # shift needs to be scaled by size of image in that dimension 178 | v_shift = np.random.uniform(-v_range, v_range) * img.shape[0] 179 | h_shift = np.random.uniform(-h_range, h_range) * img.shape[1] 180 | sx = 1 + random.uniform(-hs_range, hs_range) 181 | sy = 1 + random.uniform(-vs_range, vs_range) 182 | if h_flip and random.random() < 0.5: 183 | sx = -sx 184 | img = affine_cv(img.astype(float), angle, v_shift, h_shift, sx, sy, cval=.1) 185 | img[img == .1] = np.random.randint(256, size=np.sum(img == .1)) 186 | return img.astype(np.uint8) 187 | return rand_aff_noisy_cv 188 | 189 | 190 | def 
random_affine_cv(rotation=0, h_range=0, v_range=0, hs_range=0, vs_range=0, h_flip=False): 191 | rotation = rotation * (np.pi / 180) 192 | 193 | def rand_affine_cv(img): 194 | # compose the affine transformation applied to x 195 | angle = np.random.uniform(-rotation, rotation) 196 | # shift needs to be scaled by size of image in that dimension 197 | v_shift = np.random.uniform(-v_range, v_range) * img.shape[0] 198 | h_shift = np.random.uniform(-h_range, h_range) * img.shape[1] 199 | sx = 1 + random.uniform(-hs_range, hs_range) 200 | sy = 1 + random.uniform(-vs_range, vs_range) 201 | if h_flip and random.random() < 0.5: 202 | sx = -sx 203 | return affine_cv(img, angle, v_shift, h_shift, sx, sy) 204 | return rand_affine_cv 205 | 206 | 207 | def random_h_flip_cv(img): 208 | return img[:, ::-1, :].copy() if random.random() < 0.5 else img 209 | 210 | 211 | def imread_rgb(fname): 212 | # read and convert image from BGR to RGB 213 | im = cv2.imread(fname) 214 | return cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 215 | 216 | 217 | def tensor_2_bgr(tensor): 218 | # convert RGB tensor to BGR numpy array as used in OpenCV 219 | return cv2.cvtColor(tensor.numpy(), cv2.COLOR_RGB2BGR) 220 | -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | 4 | # Evaluation metrics (Precision@1 and mAP) given similarity matrix 5 | # Similarity matrix must have size 'test set size' x 'ref set size' 6 | # and contains in each row the similarity of that test (query) image 7 | # with all ref images 8 | def precision1(sim, test_set, ref_set, kth=1): 9 | total = sim.size(0) 10 | if kth <= 1: 11 | max_sim, max_idx = sim.max(1) 12 | else: 13 | max_sim, max_idx = sim.kthvalue(sim.size(1) - kth + 1, 1) 14 | max_label = [] 15 | for i in range(sim.size(0)): 16 | # get label from ref set which obtained highest score 17 | max_label.append(ref_set[max_idx[i, 0]][1]) 18 | correct = sum(test_label == max_label[j] for j, (_, test_label, _) in enumerate(test_set)) 19 | return float(correct) / total, correct, total, max_sim, max_label 20 | 21 | 22 | # according to Oxford buildings dataset definition of AP 23 | # the kth argument allows to ignore the k highest ranked elements of ref set 24 | # this is used to compute AP even for the train set against train set 25 | def avg_precision(sim, i, test_set, ref_set, kth=1): 26 | test_label = test_set[i][1] 27 | n_pos = sum(test_label == ref_label for _, ref_label, _ in ref_set) 28 | n_pos -= (kth - 1) 29 | if n_pos <= 0: 30 | return None 31 | old_recall, old_precision, ap = 0.0, 1.0, 0.0 32 | intersect_size, j = 0, 0 33 | _, ranked_list = sim[i].sort(dim=0, descending=True) 34 | for n, k in enumerate(ranked_list): 35 | if n + 1 < kth: 36 | continue 37 | if ref_set[k][1] == test_label: 38 | intersect_size += 1 39 | 40 | recall = intersect_size / float(n_pos) 41 | precision = intersect_size / (j + 1.0) 42 | ap += (recall - old_recall) * ((old_precision + precision) / 2.0) 43 | old_recall, old_precision = recall, precision 44 | j += 1 45 | return ap 46 | 47 | 48 | def mean_avg_precision(sim, test_set, ref_set, kth=1): 49 | aps = [] 50 | for i in range(sim.size(0)): 51 | # compute ap for each test image 52 | ap = avg_precision(sim, i, test_set, ref_set, kth) 53 | if ap is not None: 54 | aps.append(ap) 55 | return sum(aps) / float(len(aps)) 56 | -------------------------------------------------------------------------------- 
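The metrics above expect a similarity matrix of size 'test set size' x 'ref set size' and test/ref sets given as (image, label, name) triples. A small usage sketch follows; it is not part of the repository: the toy labels and file names are made up, and it assumes the old 0.x PyTorch API this code targets (where sim.max(1) keeps the reduced dimension, as precision1 relies on).

# toy check of the retrieval metrics: 2 query images against 3 reference images
import torch
from metrics import precision1, mean_avg_precision  # assumes utils/ is on sys.path

test_set = [(None, 'obj_a', 'qa.jpg'), (None, 'obj_b', 'qb.jpg')]
ref_set = [(None, 'obj_a', 'ra1.jpg'), (None, 'obj_a', 'ra2.jpg'), (None, 'obj_b', 'rb1.jpg')]
# row i holds the similarities of query i to every reference image
sim = torch.Tensor([[0.9, 0.8, 0.1],
                    [0.2, 0.3, 0.7]])
p1, correct, total, max_sim, max_label = precision1(sim, test_set, ref_set)
mAP = mean_avg_precision(sim, test_set, ref_set)
# both queries rank their own class first, so p1 and mAP should both come out as 1.0
print(p1, correct, total, mAP)

--------------------------------------------------------------------------------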
/utils/train_classif.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | from model.nn_utils import set_net_train 5 | from os import path 6 | from general import log, save_uuid, unique_str 7 | 8 | 9 | # Generic function to test and print stats when training a classification net 10 | def test_print_classif(train_type, P, net, testset_tuple, test_net, best_score=0, epoch=0): 11 | test_set, test_train_set = testset_tuple 12 | set_net_train(net, False) 13 | c, t = test_net(net, test_set) 14 | if (c > best_score): 15 | best_score = c 16 | prefix = '{0}, EPOCH:{1}, SCORE:{2}'.format(train_type, epoch, c) 17 | save_uuid(P, prefix) 18 | torch.save(net.state_dict(), path.join(P.save_dir, unique_str(P) + "_best_classif.pth.tar")) 19 | log(P, 'TEST - correct: {0} / {1} - acc: {2}'.format(c, t, float(c) / t)) 20 | 21 | c, t = test_net(net, test_train_set) 22 | torch.save(net.state_dict(), path.join(P.save_dir, "model_classif_" + str(epoch) + ".pth.tar")) 23 | log(P, 'TRAIN - correct: {0} / {1} - acc: {2}'.format(c, t, float(c) / t)) 24 | set_net_train(net, True, bn_train=P.train_bn) 25 | return best_score 26 | -------------------------------------------------------------------------------- /utils/train_general.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import gc 4 | import functools 5 | import torch.optim as optim 6 | from torch.autograd import Variable 7 | from model.nn_utils import set_net_train 8 | from general import log 9 | 10 | 11 | # Generic function to output the stats 12 | def output_stats(train_type, P, test_print, test_net, net, testset_tuple, epoch, batch_count, is_final, loss, running_loss, score): 13 | disp_int = P.train_loss_int 14 | running_loss += loss 15 | if batch_count % disp_int == disp_int - 1: 16 | log(P, '[{0:d}, {1:5d}] loss: {2:.5f}'.format(epoch + 1, batch_count + 1, running_loss / disp_int)) 17 | running_loss = 0.0 18 | test_int = P.train_test_int 19 | if ((test_int > 0 and batch_count % test_int == test_int - 1) or 20 | (test_int <= 0 and is_final)): 21 | score = test_print(train_type, P, net, testset_tuple, test_net, score, epoch + 1) 22 | return running_loss, score 23 | 24 | 25 | # evaluate a function by batches of size batch_size on the set x 26 | # and fold over the returned values 27 | def fold_batches(f, init, x, batch_size, cut_end=False, add_args={}): 28 | nx = len(x) 29 | if batch_size <= 0: 30 | return f(init, 0, True, x, **add_args) 31 | 32 | def red(last, idx): 33 | end = min(idx + batch_size, nx) 34 | if cut_end and idx + batch_size > nx: 35 | return last 36 | is_final = end > nx - batch_size if cut_end else end == nx 37 | return f(last, idx, is_final, x[idx:end], **add_args) 38 | return functools.reduce(red, range(0, nx, batch_size), init) 39 | 40 | 41 | def anneal(net, optimizer, epoch, annealing_dict): 42 | if epoch not in annealing_dict: 43 | return optimizer 44 | default_group = optimizer.state_dict()['param_groups'][0] 45 | lr = default_group['lr'] * annealing_dict[epoch] 46 | momentum = default_group['momentum'] 47 | weight_decay = default_group['weight_decay'] 48 | return optim.SGD((p for p in net.parameters() if p.requires_grad), lr=lr, momentum=momentum, weight_decay=weight_decay) 49 | 50 | 51 | def micro_batch_gen(last, i, is_final, batch, P, net, create_batch, batch_args, create_loss): 52 | gc.collect() 53 | prev_val, mini_batch_size = last 54 | n = len(batch) 55 | tensors_in, 
labels_in = create_batch(batch, n, **batch_args) 56 | tensors_out = net(*(Variable(t) for t in tensors_in)) 57 | loss, loss2 = create_loss(tensors_out, [Variable(l) for l in labels_in]) 58 | loss_micro = loss * n / mini_batch_size if P.train_loss_avg else loss 59 | val = loss_micro.data[0] 60 | if loss2 is not None: 61 | loss2_micro = loss2 * n / mini_batch_size if P.train_loss2_avg else loss2 62 | loss_micro = loss_micro + P.train_loss2_alpha * loss2_micro 63 | val = val + P.train_loss2_alpha * loss2_micro.data[0] 64 | loss_micro.backward() 65 | return prev_val + val, mini_batch_size 66 | 67 | 68 | def mini_batch_gen(last, i, is_final, batch, train_type, P, test_print, test_net, net, optimizer, testset_tuple, epoch, micro_args): 69 | batch_count, score, running_loss = last 70 | optimizer.zero_grad() 71 | loss, _ = fold_batches(micro_batch_gen, (0.0, len(batch)), batch, P.train_micro_batch, add_args=micro_args) 72 | optimizer.step() 73 | running_loss, score = output_stats(train_type, P, test_print, test_net, net, testset_tuple, epoch, batch_count, is_final, loss, running_loss, score) 74 | return batch_count + 1, score, running_loss 75 | 76 | 77 | def train_gen(train_type, P, test_print, test_net, net, train_set, testset_tuple, optimizer, create_epoch, create_batch, create_loss, best_score=0): 78 | set_net_train(net, True, bn_train=P.train_bn) 79 | for epoch in range(P.train_epochs): 80 | # annealing 81 | optimizer = anneal(net, optimizer, epoch, P.train_annealing) 82 | 83 | dataset, batch_args = create_epoch(epoch, train_set, testset_tuple) 84 | 85 | micro_args = { 86 | 'P': P, 87 | 'net': net, 88 | 'create_batch': create_batch, 89 | 'batch_args': batch_args, 90 | 'create_loss': create_loss 91 | } 92 | mini_args = { 93 | 'train_type': train_type, 94 | 'P': P, 95 | 'test_print': test_print, 96 | 'test_net': test_net, 97 | 'net': net, 98 | 'optimizer': optimizer, 99 | 'testset_tuple': testset_tuple, 100 | 'epoch': epoch, 101 | 'micro_args': micro_args 102 | } 103 | 104 | init = 0, best_score, 0.0 # batch count, score, running loss 105 | _, best_score, _ = fold_batches(mini_batch_gen, init, dataset, P.train_batch_size, cut_end=True, add_args=mini_args) 106 | -------------------------------------------------------------------------------- /utils/train_siamese.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import torch 4 | import random 5 | from os import path 6 | from model.nn_utils import set_net_train 7 | from general import tensor_t 8 | from general import log, save_uuid, unique_str 9 | from dataset import get_pos_couples 10 | from metrics import precision1, mean_avg_precision 11 | 12 | 13 | # get byte tensors indicating the indexes of images having a different label 14 | def get_lab_indicators(dataset, device): 15 | n = len(dataset) 16 | indicators = {} 17 | for _, lab1, _ in dataset: 18 | if lab1 in indicators: 19 | continue 20 | indicator = tensor_t(torch.ByteTensor, device, n).fill_(0) 21 | for i2, (_, lab2, _) in enumerate(dataset): 22 | if lab1 == lab2: 23 | indicator[i2] = 1 24 | indicators[lab1] = indicator 25 | return indicators 26 | 27 | 28 | # determine the device where embeddings should be stored 29 | # and the feature dimension for a descriptor 30 | def embeddings_device_dim(P, net, n, sim_matrix=False): 31 | # get best device for embeddings (and possibly similarity matrix), 32 | # as well as the feature vector size. 33 | # usually, this is the configured cuda device. 
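# (with 4 bytes per float32 value, the default budget of embeddings_cuda_size = 2**30 bytes holds about 131072 embeddings of dimension feature_dim = 2048)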
34 | # but it could be CPU if embeddings/number of items are too large 35 | device = P.cuda_device 36 | out_size = P.feature_dim 37 | if hasattr(net, 'feature_size') and out_size <= 0: 38 | out_size = net.feature_size 39 | if n * out_size * 4 > P.embeddings_cuda_size: 40 | device = -1 41 | if sim_matrix and n * n * 4 > P.embeddings_cuda_size: 42 | device = -1 43 | return device, out_size 44 | 45 | 46 | # get all similarities between pairs of images of the dataset 47 | # net is assumed to be in train mode 48 | def get_similarities(P, get_embeddings, net, dataset): 49 | set_net_train(net, False) 50 | n = len(dataset) 51 | d, o = embeddings_device_dim(P, net, n, sim_matrix=True) 52 | embeddings = get_embeddings(net, dataset, d, o) 53 | similarities = torch.mm(embeddings, embeddings.t()) 54 | set_net_train(net, True, bn_train=P.train_bn) 55 | return similarities, d 56 | 57 | 58 | # accuracy of a net giving feature vectors for each image, evaluated over test set and test ref set (where the images are searched for) 59 | # the model should be in eval mode 60 | # for each query image, this computes the highest-similarity match (precision at 1) as well as the average precision/ranking on the ref set 61 | def test_descriptor_net(P, get_embeddings, net, test_set, test_ref_set, kth=1): 62 | d, o = embeddings_device_dim(P, net, max(len(test_set), len(test_ref_set))) 63 | test_embeddings = get_embeddings(net, test_set, d, o) 64 | ref_embeddings = get_embeddings(net, test_ref_set, d, o) 65 | 66 | # calculate all similarities as a simple matrix multiplication 67 | # since embeddings are assumed to be normalized 68 | # the similarities here should always be on CPU 69 | # (kthvalue is only implemented there and we don't need GPU perf) 70 | sim = torch.mm(test_embeddings, ref_embeddings.t()).cpu() 71 | # stats 72 | prec1, correct, total, max_sim, max_label = precision1(sim, test_set, test_ref_set, kth) 73 | mAP = mean_avg_precision(sim, test_set, test_ref_set, kth) 74 | sum_pos = sum(sim[i, j] for i, (_, test_label, _) in enumerate(test_set) for j, (_, ref_label, _) in enumerate(test_ref_set) if test_label == ref_label) 75 | sum_neg = sim.sum() - sum_pos 76 | sum_max = max_sim.sum() 77 | lab_dict = dict([(lab, {}) for _, lab, _ in test_set]) 78 | for j, (_, lab, _) in enumerate(test_set): 79 | d = lab_dict[lab] 80 | lab = max_label[j] 81 | d[lab] = d.get(lab, 0) + 1  # count how often each ref label is predicted for this true label 82 | return prec1, correct, total, sum_pos, sum_neg, sum_max, mAP, lab_dict 83 | 84 | 85 | # Generic function to test and print stats when training a descriptor net 86 | def test_print_descriptor(train_type, P, net, testset_tuple, get_embeddings, best_score=0, epoch=0): 87 | def print_stats(prefix, p1, c, t, avg_pos, avg_neg, avg_max, mAP): 88 | s1 = 'Correct: {0} / {1} - acc: {2:.4f} - mAP:{3:.4f}\n'.format(c, t, p1, mAP) 89 | s2 = 'AVG cosine sim (sq dist) values: pos: {0:.4f} ({1:.4f}), neg: {2:.4f} ({3:.4f}), max: {4:.4f} ({5:.4f})'.format(avg_pos, 2 - 2 * avg_pos, avg_neg, 2 - 2 * avg_neg, avg_max, 2 - 2 * avg_max) 90 | log(P, prefix + s1 + s2) 91 | 92 | test_set, test_ref_set = testset_tuple 93 | set_net_train(net, False) 94 | prec1, correct, tot, sum_pos, sum_neg, sum_max, mAP, lab_dict = test_descriptor_net(P, get_embeddings, net, test_set, test_ref_set) 95 | # can save labels dictionary (predicted labels for all test labels) 96 | # TODO 97 | 98 | num_pos = sum(test_label == ref_label for _, test_label, _ in test_set for _, ref_label, _ in test_ref_set) 99 | num_neg = len(test_set) * len(test_ref_set) - num_pos 100 | 101 | if
(correct > best_score): 102 | best_score = correct 103 | prefix = '{0}, EPOCH:{1}, SCORE:{2}'.format(train_type, epoch, correct) 104 | save_uuid(P, prefix) 105 | torch.save(net.state_dict(), path.join(P.save_dir, unique_str(P) + "_best_siam.pth.tar")) 106 | print_stats('TEST - ', prec1, correct, tot, sum_pos / num_pos, sum_neg / num_neg, sum_max / len(test_set), mAP) 107 | torch.save(net.state_dict(), path.join(P.save_dir, "model_siam_" + str(epoch) + ".pth.tar")) 108 | 109 | # training set accuracy (choose second highest value, 110 | # as highest should almost certainly be the same image) 111 | # choose train samples with at least 2 other images for the query 112 | couples = get_pos_couples(test_ref_set) 113 | train_test_set = random.sample(test_ref_set, max(1, len(test_ref_set) // 10)) 114 | train_test_set = filter(lambda x: len(couples[x[1]]) >= 3, train_test_set) 115 | prec1, correct, tot, sum_pos, sum_neg, sum_max, mAP, _ = test_descriptor_net(P, get_embeddings, net, train_test_set, test_ref_set, kth=2) 116 | num_pos = sum(test_label == ref_label for _, test_label, _ in train_test_set for _, ref_label, _ in test_ref_set) 117 | num_neg = len(train_test_set) * len(test_ref_set) - num_pos 118 | print_stats('TRAIN - ', prec1, correct, tot, sum_pos / num_pos, sum_neg / num_neg, sum_max / len(train_test_set), mAP) 119 | set_net_train(net, True, bn_train=P.train_bn) 120 | return best_score 121 | --------------------------------------------------------------------------------
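The siamese test helpers above (get_similarities, test_descriptor_net, test_print_descriptor) all expect a get_embeddings(net, dataset, device, out_size) callable supplied by the training scripts under train/ (siamese_descriptor.py, siamese_regions.py, ...). Below is a minimal sketch of such a callable; it only illustrates the expected interface, not the repository's implementation: make_get_embeddings and trans are made-up names, images are assumed to be stored as file paths inside (image, label, name) triples, and the network is assumed to return one L2-normalized descriptor per image through the old Variable-based PyTorch API used throughout this code.

# hypothetical sketch of the get_embeddings callback used by utils/train_siamese.py
import torch
from torch.autograd import Variable
from general import tensor, move_device
from image import imread_rgb


def make_get_embeddings(P, trans):
    # trans: a torchvision-style transform turning an RGB numpy image into a CHW tensor
    def get_embeddings(net, dataset, device, out_size):
        # output buffer on the device chosen by embeddings_device_dim (GPU if it fits, else CPU)
        out = tensor(device, len(dataset), out_size)
        for i, (im, _, _) in enumerate(dataset):
            x = move_device(trans(imread_rgb(im)).unsqueeze(0), P.cuda_device)
            # forward pass without gradients; the descriptor is assumed to have size out_size
            feat = net(Variable(x, volatile=True)).data
            out[i] = move_device(feat.view(-1), device)
        return out
    return get_embeddings

--------------------------------------------------------------------------------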