├── .gitignore ├── LICENSE ├── README.md ├── authors ├── FerranMarques160x160.jpg ├── JordiTorres.jpg ├── JordiTorres160x160.jpg ├── MiriamBellver160x160.jpg ├── XavierGiro160x160.jpg ├── carlos160x160.jpeg ├── giro.jpg ├── marques.jpg └── miriam.jpg ├── bellver-2016-nipsws.pdf ├── img ├── HR_sequences.png ├── architecture.png ├── hierarchy.png ├── results.png └── thumbnail.png ├── logos ├── MEyC.png ├── bsc.jpg ├── bsc320x86.jpg ├── etsetb.png ├── excellence_center.png ├── generalitat.jpg ├── gpi.png ├── gpi320x70.png ├── nips500x95.png ├── nvidia.jpg ├── severo_ochoa.png └── upc.jpg ├── requirements.txt └── scripts ├── features.py ├── image_helper.py ├── image_zooms_testing.py ├── image_zooms_training.py ├── metrics.py ├── parse_xml_annotations.py ├── pool45_crops_testing.py ├── pool45_crops_training.py ├── reinforcement.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Image Processing Group - BarcelonaTECH - UPC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Object Detection with Deep Reinforcement Learning 2 | 3 | | ![NIPS 2016 logo][logo-nips] | Paper accepted at [Deep Reinforcement Learning Workshop, NIPS 2016](https://sites.google.com/site/deeprlnips2016/) | 4 | |:-:|---| 5 | 6 | [logo-nips]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/nips500x95.png?raw=true "NIPS 2016 logo" 7 | 8 | | ![Míriam Bellver][bellver-photo] | ![Xavier Giro-i-Nieto][giro-photo] | ![Ferran Marqués][marques-photo] | ![Jordi Torres][torres-photo] | 9 | |:-:|:-:|:-:|:-:| 10 | | [Míriam Bellver][bellver-web] | [Xavier Giro-i-Nieto][giro-web] | [Ferran Marques][marques-web] | [Jordi Torres][torres-web] | 11 | 12 | 13 | [bellver-web]: https://www.bsc.es/bellver-bueno-miriam 14 | [giro-web]: https://imatge.upc.edu/web/people/xavier-giro 15 | [torres-web]: http://www.jorditorres.org/ 16 | [marques-web]:https://imatge.upc.edu/web/people/ferran-marques 17 | 18 | [bellver-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/MiriamBellver160x160.jpg?raw=true "Míriam Bellver" 19 | [giro-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/XavierGiro160x160.jpg?raw=true "Xavier Giró-i-Nieto" 20 | [marques-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/FerranMarques160x160.jpg?raw=true "Ferran Marqués" 21 | [torres-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/JordiTorres.jpg?raw=true "Jordi Torres" 22 | 23 | A joint collaboration between: 24 | 25 | |![logo-bsc] | ![logo-gpi] | 26 | |:-:|:-:| 27 | | [Barcelona Supercomputing Center][bsc-web] | [UPC Image Processing Group][gpi-web] | 28 | 29 | [gpi-web]: https://imatge.upc.edu/web/ 30 | [bsc-web]: http://www.bsc.es 31 | 32 | [logo-bsc]:https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/bsc320x86.jpg?raw=true "Barcelona Supercomputing Center" 33 | [logo-gpi]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/gpi320x70.png?raw=true "UPC Image Processing Group" 34 | [logo-severo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/severo_ochoa.png?raw=true "Severo Ochoa" 35 | 36 | ## Summary 37 | 38 | We present a method for performing hierarchical object detection in images guided by a deep reinforcement learning agent. The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated providing a hierarchical image analysis. We compare two different candidate proposal strategies to guide the object search: with and without overlap. 
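As an illustration of the candidate regions the agent chooses among, the following minimal sketch (not part of the repository scripts; `candidate_subregions` is a hypothetical helper) generates the five overlapping subregions used by the hierarchy with the default `scale_subregion = 3/4`: four candidates anchored at the corners of the parent window plus one centred candidate. A sixth action triggers the end of the search on the current region.

```python
# Illustrative sketch only: the five overlapping subregion candidates
# (four corners + centre) at scale_subregion = 3/4, mirroring the offset
# arithmetic of the zoom actions in image_zooms_training.py.
def candidate_subregions(y, x, height, width, scale_subregion=3.0 / 4):
    new_h, new_w = height * scale_subregion, width * scale_subregion
    step_y, step_x = height - new_h, width - new_w  # slack left inside the parent window
    return [
        (y, x, new_h, new_w),                            # top-left
        (y, x + step_x, new_h, new_w),                   # top-right
        (y + step_y, x, new_h, new_w),                   # bottom-left
        (y + step_y, x + step_x, new_h, new_w),          # bottom-right
        (y + step_y / 2, x + step_x / 2, new_h, new_w),  # centre
    ]
```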
39 | 40 | ![Hierarchy of overlapping region proposals](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/hierarchy.png?raw=true) 41 | 42 | Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image and later generates crops of them for each region proposal. 43 | 44 | ![Architectures for convolutional feature extraction](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/architecture.png?raw=true) 45 | 46 | Experiments indicate better results for the overlapping candidate proposal strategy, and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with large amounts of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions. 47 | 48 | ![Qualitative results](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/HR_sequences.png?raw=true) 49 | 50 | ## Publication 51 | 52 | Our workshop paper is available on [arXiv](https://arxiv.org/abs/1611.03718), and the related slides [here](http://www.slideshare.net/xavigiro/hierarchical-object-detection-with-deep-reinforcement-learning). 53 | 54 | Please cite with the following Bibtex code: 55 | 56 | ```` 57 | @InProceedings{Bellver_2016_NIPSWS, 58 | author = {Bellver, Miriam and Giro-i-Nieto, Xavier and Marques, Ferran and Torres, Jordi}, 59 | title = {Hierarchical Object Detection with Deep Reinforcement Learning}, 60 | booktitle = {Deep Reinforcement Learning Workshop, NIPS}, 61 | month = {December}, 62 | year = {2016} 63 | } 64 | ```` 65 | 66 | You may also want to refer to our publication in the more human-friendly Chicago style: 67 | 68 | *Miriam Bellver, Xavier Giro-i-Nieto, Ferran Marques, and Jordi Torres. "Hierarchical Object Detection with Deep Reinforcement Learning." In Deep Reinforcement Learning Workshop (NIPS). 2016.* 69 | 70 | ## Code Instructions 71 | 72 | This Python code enables both training and testing of the two models proposed in the paper. The Image Zooms model extracts features for each region visited, whereas the Pool45 Crops model extracts features just once and then ROI-pools them for each subregion. In this section we describe how to use the code. The code uses the Keras framework. If you are using a virtual environment, you can use the requirements.txt provided. 73 | 74 | 75 | First, it is important to note that this code is already an extension of the code used for the paper. During the training stage, we do not consider only one object per image: we also train for other objects by covering the already found objects with the VGG-16 mean, inspired by what Caicedo et al. did in Active Object Localization with Deep Reinforcement Learning. 76 | 77 | ### Setup 78 | 79 | First of all, the VGG-16 weights should be downloaded from the following link: [VGG-16 weights]. If you want to use pre-trained models for the Deep Q-network, they can be downloaded from the following link: [Image Zooms model].
Notice that these models could lead to results different from the ones provided in the paper, because they are already trained to find more than one instance of planes in the image. You should also create two folders in the root of the project, called models_image_zooms and models_pool45_crops, and store the corresponding weights inside them. 80 | 81 | 82 | [VGG-16 weights]: http://imatge.upc.edu/web/sites/default/files/projects/deeplearning/public/detection-2016-nipsws/vgg16_weights.h5 83 | [Image Zooms model]: http://imatge.upc.edu/web/sites/default/files/projects/deeplearning/public/detection-2016-nipsws/model_image_zooms_2 84 | 85 | 86 | ### Usage 87 | 88 | ##### Training 89 | 90 | As an example, we will follow how to train the Image Zooms model, which is the one that achieves better results. The instructions are the same for training the Pool45 Crops model. The script is image_zooms_training.py, and the paths to the database should be configured first. The default paths are the following: 91 | 92 | # path of PASCAL VOC 2012 or other database to use for training 93 | path_voc = "./VOC2012/" 94 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 95 | path_voc2 = "./VOC2007/" 96 | # path of where to store the models 97 | path_model = "../models_image_zooms" 98 | # path of where to store visualizations of search sequences 99 | path_testing_folder = '../testing_visualizations' 100 | # path of VGG16 weights 101 | path_vgg = "../vgg16_weights.h5" 102 | 103 | You can change them to point to your own locations. 104 | 105 | Training supports checkpointing, so you should indicate which epoch you are going to train when running the script. If you are training from scratch, the training command should be: 106 | 107 | python image_zooms_training.py -n 0 108 | 109 | There are many options that can be changed to test different configurations: 110 | 111 | **class_object**: the class for which you want to train the models. We have trained it for planes, and all the experiments of the paper are run on this class, but you can test other PASCAL categories, changing the training databases appropriately. 112 | 113 | **number_of_steps**: for how many steps the agent searches for an object in an image. 114 | 115 | **scale_subregion**: the scale of the subregions in the hierarchy compared to their ancestor. The default value is 3/4, which yielded good results in our experiments, but it can easily be changed. Take into consideration that the subregion scale and the number of steps are highly correlated: if the subregion scale is high, you will probably require more steps to find objects. 116 | 117 | **bool_draw**: a boolean that, if set to 1, stores visualizations of the image search sequences. 118 | 119 | At each epoch the models will be saved in the models_image_zooms folder. 120 | 121 | ##### Testing 122 | 123 | To test the models, use the script image_zooms_testing.py. You should also configure the paths to indicate which weights you want to use, in the same manner as in the training stage. In this case, you only need to run the command python image_zooms_testing.py. For testing it is recommended to set bool_draw = 1, so you can observe visualizations of the object search sequences. There is also the option to search for just a single object in each image, to reproduce the results of our paper, by setting the boolean only_first_object to 1.
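When the agent is allowed to search for more than one object per image (only_first_object = 0), every region found so far is covered with the VGG-16 mean pixel so that later search steps ignore it, as is also done during training (see the Setup section above). The following minimal sketch is not part of the repository scripts; it is a vectorised equivalent of mask_image_with_mean_background in scripts/image_helper.py, with `cover_found_object` as a hypothetical helper name.

```python
import numpy as np

# Mean pixel values used for VGG-16 preprocessing throughout this repository.
VGG_MEAN_PIXEL = (103.939, 116.779, 123.68)

def cover_found_object(image, region_mask):
    """Replace the pixels of an already-found region with the VGG-16 mean pixel.

    Vectorised sketch of mask_image_with_mean_background() in
    scripts/image_helper.py: subsequent search steps then ignore that object.
    `image` is an H x W x 3 array and `region_mask` an H x W binary mask.
    """
    covered = np.array(image, dtype=np.float32)
    covered[region_mask == 1] = VGG_MEAN_PIXEL
    return covered
```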
124 | 125 | 126 | ## Acknowledgements 127 | 128 | We would especially like to thank Albert Gil Moreno and Josep Pujal from our technical support team at the Image Processing Group at the UPC. We would also like to thank Carlos Tripiana from the technical support team at the Barcelona Supercomputing Center (BSC). 129 | 130 | | ![AlbertGil-photo] | ![JosepPujal-photo] | ![CarlosTripiana-photo] | 131 | |:-:|:-:|:-:| 132 | | [Albert Gil](https://imatge.upc.edu/web/people/albert-gil-moreno) | [Josep Pujal](https://imatge.upc.edu/web/people/josep-pujal) | [Carlos Tripiana](https://www.bsc.es/tripiana-carlos/) | 133 | 134 | [AlbertGil-photo]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/authors/AlbertGil.jpg "Albert Gil" 135 | [JosepPujal-photo]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/authors/JosepPujal.jpg "Josep Pujal" 136 | [CarlosTripiana-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/carlos160x160.jpeg?raw=true "Carlos Tripiana" 137 | 138 | [AlbertGil-web]: https://imatge.upc.edu/web/people/albert-gil-moreno 139 | [JosepPujal-web]: https://imatge.upc.edu/web/people/josep-pujal 140 | [CarlosTripiana-web]: https://www.bsc.es/tripiana-carlos/ 141 | 142 | | | | 143 | |:--|:-:| 144 | | This work has been supported by the [grant SEV2015-0493 of the Severo Ochoa Program](https://www.bsc.es/es/severo-ochoa/presentaci%C3%B3n) awarded by the Spanish Government, by project TIN2015-65316 of the Spanish Ministry of Science and Innovation, and by contract 2014-SGR-1051 of the Generalitat de Catalunya. | ![logo-severo] | 145 | | We gratefully acknowledge the support of [NVIDIA Corporation](http://www.nvidia.com/content/global/global.php) with the donation of the GeForce GTX [Titan Z](http://www.nvidia.com/gtx-700-graphics-cards/gtx-titan-z/) and [Titan X](http://www.geforce.com/hardware/desktop-gpus/geforce-gtx-titan-x) used in this work at the UPC, and the BSC/UPC NVIDIA GPU Center of Excellence. | ![logo-nvidia] | 146 | | The Image Processing Group at the UPC is a [SGR14 Consolidated Research Group](https://imatge.upc.edu/web/projects/sgr14-image-and-video-processing-group) recognized and sponsored by the Catalan Government (Generalitat de Catalunya) through its [AGAUR](http://agaur.gencat.cat/en/inici/index.html) office. | ![logo-catalonia] | 147 | | This work has been developed in the framework of the project [BigGraph TEC2013-43935-R](https://imatge.upc.edu/web/projects/biggraph-heterogeneous-information-and-graph-signal-processing-big-data-era-application), funded by the Spanish Ministerio de Economía y Competitividad and the European Regional Development Fund (ERDF). | ![logo-spain] | 148 | 149 | 150 | [logo-nvidia]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/excellence_center.png?raw=true "Logo of NVidia" 151 | [logo-catalonia]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/logos/generalitat.jpg "Logo of Catalan government" 152 | [logo-spain]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/logos/MEyC.png "Logo of Spanish government" 153 | 154 | 155 | ## Contact 156 | 157 | If you have any general question about our work or code that may be of interest to other researchers, please use the [public issues section](https://github.com/imatge-upc/detection-2016-nipsws/issues) on this GitHub repo. Alternatively, drop us an e-mail at and .
158 | 159 | 160 | -------------------------------------------------------------------------------- /authors/FerranMarques160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/FerranMarques160x160.jpg -------------------------------------------------------------------------------- /authors/JordiTorres.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/JordiTorres.jpg -------------------------------------------------------------------------------- /authors/JordiTorres160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/JordiTorres160x160.jpg -------------------------------------------------------------------------------- /authors/MiriamBellver160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/MiriamBellver160x160.jpg -------------------------------------------------------------------------------- /authors/XavierGiro160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/XavierGiro160x160.jpg -------------------------------------------------------------------------------- /authors/carlos160x160.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/carlos160x160.jpeg -------------------------------------------------------------------------------- /authors/giro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/giro.jpg -------------------------------------------------------------------------------- /authors/marques.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/marques.jpg -------------------------------------------------------------------------------- /authors/miriam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/miriam.jpg -------------------------------------------------------------------------------- /bellver-2016-nipsws.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/bellver-2016-nipsws.pdf -------------------------------------------------------------------------------- /img/HR_sequences.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/HR_sequences.png -------------------------------------------------------------------------------- /img/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/architecture.png -------------------------------------------------------------------------------- /img/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/hierarchy.png -------------------------------------------------------------------------------- /img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/results.png -------------------------------------------------------------------------------- /img/thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/thumbnail.png -------------------------------------------------------------------------------- /logos/MEyC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/MEyC.png -------------------------------------------------------------------------------- /logos/bsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/bsc.jpg -------------------------------------------------------------------------------- /logos/bsc320x86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/bsc320x86.jpg -------------------------------------------------------------------------------- /logos/etsetb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/etsetb.png -------------------------------------------------------------------------------- /logos/excellence_center.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/excellence_center.png -------------------------------------------------------------------------------- /logos/generalitat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/generalitat.jpg -------------------------------------------------------------------------------- /logos/gpi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/gpi.png 
-------------------------------------------------------------------------------- /logos/gpi320x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/gpi320x70.png -------------------------------------------------------------------------------- /logos/nips500x95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/nips500x95.png -------------------------------------------------------------------------------- /logos/nvidia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/nvidia.jpg -------------------------------------------------------------------------------- /logos/severo_ochoa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/severo_ochoa.png -------------------------------------------------------------------------------- /logos/upc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/upc.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.shutil-get-terminal-size==1.0.0 2 | cycler==0.10.0 3 | Cython==0.24.1 4 | decorator==4.0.10 5 | easydict==1.6 6 | enum34==1.1.6 7 | h5py==2.6.0 8 | ipython==5.1.0 9 | ipython-genutils==0.1.0 10 | Keras==1.0.8 11 | matplotlib==1.5.3 12 | numpy==1.11.1 13 | pathlib2==2.1.0 14 | pexpect==4.2.1 15 | pickleshare==0.7.4 16 | Pillow==3.3.1 17 | prompt-toolkit==1.0.7 18 | protobuf==3.0.0b2 19 | ptyprocess==0.5.1 20 | Pygments==2.1.3 21 | pyparsing==2.1.9 22 | python-dateutil==2.5.3 23 | pytz==2016.6.1 24 | PyYAML==3.12 25 | scikit-learn==0.17.1 26 | scipy==0.18.0 27 | simplegeneric==0.8.1 28 | six==1.10.0 29 | sklearn==0.0 30 | Theano==0.8.2 31 | traitlets==4.3.1 32 | wcwidth==0.1.7 33 | -------------------------------------------------------------------------------- /scripts/features.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.core import Flatten, Dense, Dropout 3 | from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D 4 | from keras.optimizers import SGD 5 | from keras import backend as K 6 | import cv2, numpy as np 7 | import math 8 | import numpy, scipy 9 | from scipy import interpolate 10 | import scipy.ndimage 11 | import time 12 | 13 | # the feature size is of 7x7xp, being p the number of channels 14 | feature_size = 7 15 | # the relative scale reduction of the shallower feature map compared to the initial image input 16 | scale_reduction_shallower_feature = 16 17 | # the relative scale reduction of the deeper feature map compared to the initial image input 18 | scale_reduction_deeper_feature = 32 19 | # scaling of the input image 20 | factor_x_input = float(1) 21 | factor_y_input = float(1) 22 | 23 | 24 | # Interpolation of 2d features for a single 
channel of a feature map 25 | def interpolate_2d_features(features): 26 | out_size = feature_size 27 | x = np.arange(features.shape[0]) 28 | y = np.arange(features.shape[1]) 29 | z = features 30 | xx = np.linspace(x.min(), x.max(), out_size) 31 | yy = np.linspace(y.min(), y.max(), out_size) 32 | new_kernel = interpolate.RectBivariateSpline(x, y, z, kx=1, ky=1) 33 | kernel_out = new_kernel(xx, yy) 34 | return kernel_out 35 | 36 | 37 | # Interpolation 2d of each channel, so we obtain 3d interpolated feature maps 38 | def interpolate_3d_features(features): 39 | new_features = np.zeros([512, feature_size, feature_size]) 40 | for i in range(features.shape[0]): 41 | new_features[i, :, :] = interpolate_2d_features(features[i, :, :]) 42 | return new_features 43 | 44 | 45 | def pop_layer(model): 46 | if not model.outputs: 47 | raise Exception('Sequential model cannot be popped: model is empty.') 48 | model.layers.pop() 49 | if not model.layers: 50 | model.outputs = [] 51 | model.inbound_nodes = [] 52 | model.outbound_nodes = [] 53 | else: 54 | model.layers[-1].outbound_nodes = [] 55 | model.outputs = [model.layers[-1].output] 56 | model.built = False 57 | return model 58 | 59 | 60 | def get_convolutional_vgg16_compiled(vgg_weights_path): 61 | model_vgg = obtain_compiled_vgg_16(vgg_weights_path) 62 | for i in range(0, 6): 63 | model_vgg = pop_layer(model_vgg) 64 | return model_vgg 65 | 66 | 67 | def get_feature_maps(model, img): 68 | return [get_feature_map_4(model, img), get_feature_map_8(model, img)] 69 | 70 | 71 | # get deeper feature map 72 | def get_feature_map_8(model, im): 73 | im = im.astype(np.float32) 74 | dim_ordering = K.image_dim_ordering() 75 | if dim_ordering == 'th': 76 | # 'RGB'->'BGR' 77 | im = im[::-1, :, :] 78 | # Zero-center by mean pixel 79 | im[0, :, :] -= 103.939 80 | im[1, :, :] -= 116.779 81 | im[2, :, :] -= 123.68 82 | else: 83 | # 'RGB'->'BGR' 84 | im = im[:, :, ::-1] 85 | # Zero-center by mean pixel 86 | im[:, :, 0] -= 103.939 87 | im[:, :, 1] -= 116.779 88 | im[:, :, 2] -= 123.68 89 | im = im.transpose((2, 0, 1)) 90 | im = np.expand_dims(im, axis=0) 91 | inputs = [K.learning_phase()] + model.inputs 92 | _convout1_f = K.function(inputs, model.outputs) 93 | feature_map = _convout1_f([0] + [im]) 94 | feature_map = np.array([feature_map]) 95 | feature_map = feature_map[0, 0, 0, :, :, :] 96 | return feature_map 97 | 98 | 99 | # get shallower feature map 100 | def get_feature_map_4(model, im): 101 | im = im.astype(np.float32) 102 | dim_ordering = K.image_dim_ordering() 103 | if dim_ordering == 'th': 104 | # 'RGB'->'BGR' 105 | im = im[::-1, :, :] 106 | # Zero-center by mean pixel 107 | im[0, :, :] -= 103.939 108 | im[1, :, :] -= 116.779 109 | im[2, :, :] -= 123.68 110 | else: 111 | # 'RGB'->'BGR' 112 | im = im[:, :, ::-1] 113 | # Zero-center by mean pixel 114 | im[:, :, 0] -= 103.939 115 | im[:, :, 1] -= 116.779 116 | im[:, :, 2] -= 123.68 117 | im = im.transpose((2, 0, 1)) 118 | im = np.expand_dims(im, axis=0) 119 | inputs = [K.learning_phase()] + model.inputs 120 | _convout1_f = K.function(inputs, [model.layers[23].output]) 121 | feature_map = _convout1_f([0] + [im]) 122 | feature_map = np.array([feature_map]) 123 | feature_map = feature_map[0, 0, 0, :, :, :] 124 | return feature_map 125 | 126 | 127 | def crop_roi(feature_map, coordinates): 128 | return feature_map[:, coordinates[0]:coordinates[0]+coordinates[2], coordinates[1]:coordinates[1]+coordinates[3]] 129 | 130 | 131 | # this method decides whether to use the deeper or the shallower feature map 132 | # and then crops 
and interpolates if necessary the features to obtain a final descriptor of 7x7xp 133 | def obtain_descriptor_from_feature_map(feature_maps, region_coordinates): 134 | initial_width = region_coordinates[2]*factor_x_input 135 | initial_height = region_coordinates[3]*factor_y_input 136 | scale_aux = math.sqrt(initial_height*initial_width)/math.sqrt(feature_size*feature_size) 137 | if scale_aux > scale_reduction_deeper_feature: 138 | scale = scale_reduction_deeper_feature 139 | feature_map = feature_maps[1] 140 | else: 141 | scale = scale_reduction_shallower_feature 142 | feature_map = feature_maps[0] 143 | new_width = initial_width/scale 144 | new_height = initial_height/scale 145 | if new_width < feature_size: 146 | new_width = feature_size 147 | if new_height < feature_size: 148 | new_height = feature_size 149 | xo = region_coordinates[0]/scale 150 | yo = region_coordinates[1]/scale 151 | feat = np.array([feature_map]) 152 | if new_width + xo > feat.shape[2]: 153 | xo = feat.shape[2] - new_width 154 | if new_height + yo > feat.shape[3]: 155 | yo = feat.shape[3] - new_height 156 | if xo < 0: 157 | xo = 0 158 | if yo < 0: 159 | yo = 0 160 | new_coordinates = np.array([xo, yo, new_width, new_height]) 161 | roi = crop_roi(feature_map, new_coordinates) 162 | if roi.shape[1] < feature_size & roi.shape[2] < feature_size: 163 | features = interpolate_3d_features(roi) 164 | elif roi.shape[2] < feature_size: 165 | features = interpolate_3d_features(roi) 166 | elif roi.shape[1] < feature_size: 167 | features = interpolate_3d_features(roi) 168 | else: 169 | features = extract_features_from_roi(roi) 170 | return features 171 | 172 | 173 | # ROI-pooling features 174 | def extract_features_from_roi(roi): 175 | roi_width = roi.shape[1] 176 | roi_height = roi.shape[2] 177 | new_width = roi_width / feature_size 178 | new_height = roi_height / feature_size 179 | pooled_values = np.zeros([feature_size, feature_size, 512]) 180 | for j in range(512): 181 | for i in range(feature_size): 182 | for k in range(feature_size): 183 | if k == (feature_size-1) & i == (feature_size-1): 184 | patch = roi[j, i * new_width:roi_width, k * new_height:roi_height] 185 | elif k == (feature_size-1): 186 | patch = roi[j, i * new_width:(i + 1) * new_width, k * new_height:roi_height] 187 | elif i == (feature_size-1): 188 | patch = roi[j, i * new_width:roi_width, k * new_height:(k + 1) * new_height] 189 | else: 190 | patch = roi[j, i * new_width:(i + 1) * new_width, k * new_height:(k + 1) * new_height] 191 | pooled_values[i, k, j] = np.max(patch) 192 | return pooled_values 193 | 194 | 195 | def calculate_all_initial_feature_maps(images, model, image_names): 196 | initial_feature_maps = [] 197 | for z in range(np.size(image_names)): 198 | initial_feature_maps.append(get_feature_maps(model, np.array(images[z]))) 199 | return initial_feature_maps 200 | 201 | 202 | def get_image_descriptor_for_image(image, model): 203 | im = cv2.resize(image, (224, 224)).astype(np.float32) 204 | dim_ordering = K.image_dim_ordering() 205 | if dim_ordering == 'th': 206 | # 'RGB'->'BGR' 207 | im = im[::-1, :, :] 208 | # Zero-center by mean pixel 209 | im[0, :, :] -= 103.939 210 | im[1, :, :] -= 116.779 211 | im[2, :, :] -= 123.68 212 | else: 213 | # 'RGB'->'BGR' 214 | im = im[:, :, ::-1] 215 | # Zero-center by mean pixel 216 | im[:, :, 0] -= 103.939 217 | im[:, :, 1] -= 116.779 218 | im[:, :, 2] -= 123.68 219 | im = im.transpose((2, 0, 1)) 220 | im = np.expand_dims(im, axis=0) 221 | inputs = [K.learning_phase()] + model.inputs 222 | _convout1_f = 
K.function(inputs, [model.layers[33].output]) 223 | return _convout1_f([0] + [im]) 224 | 225 | 226 | def get_conv_image_descriptor_for_image(image, model): 227 | im = cv2.resize(image, (224, 224)).astype(np.float32) 228 | dim_ordering = K.image_dim_ordering() 229 | if dim_ordering == 'th': 230 | # 'RGB'->'BGR' 231 | im = im[::-1, :, :] 232 | # Zero-center by mean pixel 233 | im[0, :, :] -= 103.939 234 | im[1, :, :] -= 116.779 235 | im[2, :, :] -= 123.68 236 | else: 237 | # 'RGB'->'BGR' 238 | im = im[:, :, ::-1] 239 | # Zero-center by mean pixel 240 | im[:, :, 0] -= 103.939 241 | im[:, :, 1] -= 116.779 242 | im[:, :, 2] -= 123.68 243 | im = im.transpose((2, 0, 1)) 244 | im = np.expand_dims(im, axis=0) 245 | inputs = [K.learning_phase()] + model.inputs 246 | _convout1_f = K.function(inputs, [model.layers[31].output]) 247 | return _convout1_f([0] + [im]) 248 | 249 | 250 | def obtain_compiled_vgg_16(vgg_weights_path): 251 | model = vgg_16(vgg_weights_path) 252 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 253 | model.compile(optimizer=sgd, loss='categorical_crossentropy') 254 | return model 255 | 256 | 257 | def vgg_16(weights_path=None): 258 | model = Sequential() 259 | model.add(ZeroPadding2D((1, 1), input_shape=(3, 224, 224))) 260 | model.add(Convolution2D(64, 3, 3, activation='relu')) 261 | model.add(ZeroPadding2D((1, 1))) 262 | model.add(Convolution2D(64, 3, 3, activation='relu')) 263 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 264 | 265 | model.add(ZeroPadding2D((1, 1))) 266 | model.add(Convolution2D(128, 3, 3, activation='relu')) 267 | model.add(ZeroPadding2D((1, 1))) 268 | model.add(Convolution2D(128, 3, 3, activation='relu')) 269 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 270 | 271 | model.add(ZeroPadding2D((1, 1))) 272 | model.add(Convolution2D(256, 3, 3, activation='relu')) 273 | model.add(ZeroPadding2D((1, 1))) 274 | model.add(Convolution2D(256, 3, 3, activation='relu')) 275 | model.add(ZeroPadding2D((1, 1))) 276 | model.add(Convolution2D(256, 3, 3, activation='relu')) 277 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 278 | 279 | model.add(ZeroPadding2D((1, 1))) 280 | model.add(Convolution2D(512, 3, 3, activation='relu')) 281 | model.add(ZeroPadding2D((1, 1))) 282 | model.add(Convolution2D(512, 3, 3, activation='relu')) 283 | model.add(ZeroPadding2D((1, 1))) 284 | model.add(Convolution2D(512, 3, 3, activation='relu')) 285 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 286 | 287 | model.add(ZeroPadding2D((1, 1))) 288 | model.add(Convolution2D(512, 3, 3, activation='relu')) 289 | model.add(ZeroPadding2D((1, 1))) 290 | model.add(Convolution2D(512, 3, 3, activation='relu')) 291 | model.add(ZeroPadding2D((1, 1))) 292 | model.add(Convolution2D(512, 3, 3, activation='relu')) 293 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 294 | 295 | model.add(Flatten()) 296 | model.add(Dense(4096, activation='relu')) 297 | model.add(Dropout(0.5)) 298 | model.add(Dense(4096, activation='relu')) 299 | model.add(Dropout(0.5)) 300 | model.add(Dense(1000, activation='softmax')) 301 | 302 | if weights_path: 303 | model.load_weights(weights_path) 304 | 305 | return model 306 | 307 | -------------------------------------------------------------------------------- /scripts/image_helper.py: -------------------------------------------------------------------------------- 1 | from keras.preprocessing import image 2 | import numpy as np 3 | 4 | 5 | def get_all_ids(annotations): 6 | all_ids = [] 7 | for i in range(len(annotations)): 8 | 
all_ids.append(get_ids_objects_from_annotation(annotations[i])) 9 | return all_ids 10 | 11 | 12 | def get_all_images(image_names, path_voc): 13 | images = [] 14 | for j in range(np.size(image_names)): 15 | image_name = image_names[0][j] 16 | string = path_voc + '/JPEGImages/' + image_name + '.jpg' 17 | images.append(image.load_img(string, False)) 18 | return images 19 | 20 | 21 | def get_all_images_pool(image_names, path_voc): 22 | images = [] 23 | for j in range(np.size(image_names)): 24 | image_name = image_names[j] 25 | string = path_voc + '/JPEGImages/' + image_name + '.jpg' 26 | images.append(image.load_img(string, False)) 27 | return images 28 | 29 | 30 | def load_images_names_in_data_set(data_set_name, path_voc): 31 | file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt' 32 | f = open(file_path) 33 | image_names = f.readlines() 34 | image_names = [x.strip('\n') for x in image_names] 35 | if data_set_name.startswith("aeroplane") | data_set_name.startswith("bird") | data_set_name.startswith("cow"): 36 | return [x.split(None, 1)[0] for x in image_names] 37 | else: 38 | return [x.strip('\n') for x in image_names] 39 | 40 | 41 | def load_images_labels_in_data_set(data_set_name, path_voc): 42 | file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt' 43 | f = open(file_path) 44 | images_names = f.readlines() 45 | images_names = [x.split(None, 1)[1] for x in images_names] 46 | images_names = [x.strip('\n') for x in images_names] 47 | return images_names 48 | 49 | 50 | def mask_image_with_mean_background(mask_object_found, image): 51 | new_image = image 52 | size_image = np.shape(mask_object_found) 53 | for j in range(size_image[0]): 54 | for i in range(size_image[1]): 55 | if mask_object_found[j][i] == 1: 56 | new_image[j, i, 0] = 103.939 57 | new_image[j, i, 1] = 116.779 58 | new_image[j, i, 2] = 123.68 59 | return new_image -------------------------------------------------------------------------------- /scripts/image_zooms_testing.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | from scipy import ndimage 13 | from keras.preprocessing import image 14 | from sklearn.preprocessing import OneHotEncoder 15 | 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | if __name__ == "__main__": 25 | 26 | ######## PATHS definition ######## 27 | 28 | # path of pascal voc test 29 | path_voc_test = "./VOC2007_test/" 30 | # model name of the weights 31 | model_name = "model_image_zooms" 32 | # path of folder where the weights are 33 | weights_path = "../models_image_zooms/" 34 | # path of where to store visualizations of search sequences 35 | path_testing_folder = '../testing/' 36 | # path of VGG16 weights 37 | path_vgg = "../vgg16_weights.h5" 38 | 39 | ######## MODELS ######## 40 | 41 | model_vgg = obtain_compiled_vgg_16(path_vgg) 42 | model = 
get_q_network(weights_path + model_name) 43 | 44 | ######## LOAD IMAGE NAMES ######## 45 | 46 | image_names = np.array([load_images_names_in_data_set('aeroplane_test', path_voc_test)]) 47 | labels = load_images_labels_in_data_set('aeroplane_test', path_voc_test) 48 | 49 | ######## LOAD IMAGES ######## 50 | 51 | images = get_all_images(image_names, path_voc_test) 52 | 53 | ######## PARAMETERS ######## 54 | 55 | # Class category of PASCAL that the RL agent will be searching 56 | class_object = 1 57 | # 1 if you want to obtain visualizations of the search for objects 58 | bool_draw = 1 59 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 60 | scale_subregion = float(3)/4 61 | scale_mask = float(1)/(scale_subregion*4) 62 | # Number of steps that the agent does at each image 63 | number_of_steps = 10 64 | # Only search first object 65 | only_first_object = 1 66 | 67 | for j in range(np.size(image_names)): 68 | if labels[j] == "1": 69 | image = np.array(images[j]) 70 | # init drawing for visualization 71 | background = Image.new('RGBA', (10000, 2000), (255, 255, 255, 255)) 72 | draw = ImageDraw.Draw(background) 73 | image_name = image_names[0][j] 74 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc_test) 75 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 76 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 77 | size_mask = (image.shape[0], image.shape[1]) 78 | original_shape = size_mask 79 | image_for_search = image 80 | region_mask = np.ones([image.shape[0], image.shape[1]]) 81 | # offset of the region observed at each time step 82 | offset = (0, 0) 83 | # absolute status is a boolean we indicate if the agent will continue 84 | # searching object or not. 
If the first object already covers the whole 85 | # image, we can put it at 0 so we do not further search there 86 | absolute_status = 1 87 | action = 0 88 | step = 0 89 | qval = 0 90 | region_image = image_for_search 91 | region_mask = np.ones([image.shape[0], image.shape[1]]) 92 | # we run the agent if the maximum number of steps has not been reached and 93 | # if the boolean 94 | while (step < number_of_steps) and (absolute_status == 1): 95 | iou = 0 96 | # we init history vector as we are going to find another object 97 | history_vector = np.zeros([24]) 98 | state = get_state(region_image, history_vector, model_vgg) 99 | status = 1 100 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 101 | region_mask, image_name, bool_draw) 102 | size_mask = (image.shape[0], image.shape[1]) 103 | original_shape = size_mask 104 | region_mask = np.ones([image.shape[0], image.shape[1]]) 105 | while (status == 1) & (step < number_of_steps): 106 | step += 1 107 | qval = model.predict(state.T, batch_size=1) 108 | action = (np.argmax(qval))+1 109 | # movement action, make the proper zoom on the image 110 | if action != 6: 111 | region_mask = np.zeros(original_shape) 112 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 113 | if action == 1: 114 | offset_aux = (0, 0) 115 | elif action == 2: 116 | offset_aux = (0, size_mask[1] * scale_mask) 117 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 118 | elif action == 3: 119 | offset_aux = (size_mask[0] * scale_mask, 0) 120 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 121 | elif action == 4: 122 | offset_aux = (size_mask[0] * scale_mask, 123 | size_mask[1] * scale_mask) 124 | offset = (offset[0] + size_mask[0] * scale_mask, 125 | offset[1] + size_mask[1] * scale_mask) 126 | elif action == 5: 127 | offset_aux = (size_mask[0] * scale_mask / 2, 128 | size_mask[0] * scale_mask / 2) 129 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 130 | offset[1] + size_mask[0] * scale_mask / 2) 131 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 132 | offset_aux[1]:offset_aux[1] + size_mask[1]] 133 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 134 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 135 | region_mask, image_name, bool_draw) 136 | # trigger action 137 | if action == 6: 138 | offset = (0, 0) 139 | status = 0 140 | if step == 1: 141 | absolute_status = 0 142 | if only_first_object == 1: 143 | absolute_status = 0 144 | image_for_search = mask_image_with_mean_background(region_mask, image_for_search) 145 | region_image = image_for_search 146 | history_vector = update_history_vector(history_vector, action) 147 | new_state = get_state(region_image, history_vector, model_vgg) 148 | state = new_state 149 | -------------------------------------------------------------------------------- /scripts/image_zooms_training.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | import argparse 13 | from scipy import ndimage 14 
| from keras.preprocessing import image 15 | from sklearn.preprocessing import OneHotEncoder 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | 25 | # Read number of epoch to be trained, to make checkpointing 26 | parser = argparse.ArgumentParser(description='Epoch:') 27 | parser.add_argument("-n", metavar='N', type=int, default=0) 28 | args = parser.parse_args() 29 | epochs_id = int(args.n) 30 | 31 | 32 | if __name__ == "__main__": 33 | 34 | ######## PATHS definition ######## 35 | 36 | # path of PASCAL VOC 2012 or other database to use for training 37 | path_voc = "./VOC2012/" 38 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 39 | path_voc2 = "./VOC2007/" 40 | # path of where to store the models 41 | path_model = "../models_image_zooms" 42 | # path of where to store visualizations of search sequences 43 | path_testing_folder = '../testing_visualizations' 44 | # path of VGG16 weights 45 | path_vgg = "../vgg16_weights.h5" 46 | 47 | ######## PARAMETERS ######## 48 | 49 | # Class category of PASCAL that the RL agent will be searching 50 | class_object = 1 51 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 52 | scale_subregion = float(3)/4 53 | scale_mask = float(1)/(scale_subregion*4) 54 | # 1 if you want to obtain visualizations of the search for objects 55 | bool_draw = 0 56 | # How many steps can run the agent until finding one object 57 | number_of_steps = 10 58 | # Boolean to indicate if you want to use the two databases, or just one 59 | two_databases = 0 60 | epochs = 50 61 | gamma = 0.90 62 | epsilon = 1 63 | batch_size = 100 64 | # Pointer to where to store the last experience in the experience replay buffer, 65 | # actually there is a pointer for each PASCAL category, in case all categories 66 | # are trained at the same time 67 | h = np.zeros([20]) 68 | # Each replay memory (one for each possible category) has a capacity of 100 experiences 69 | buffer_experience_replay = 1000 70 | # Init replay memories 71 | replay = [[] for i in range(20)] 72 | reward = 0 73 | 74 | ######## MODELS ######## 75 | 76 | model_vgg = obtain_compiled_vgg_16(path_vgg) 77 | 78 | # If you want to train it from first epoch, first option is selected. 
Otherwise, 79 | # when making checkpointing, weights of last stored weights are loaded for a particular class object 80 | 81 | if epochs_id == 0: 82 | models = get_array_of_q_networks_for_pascal("0", class_object) 83 | else: 84 | models = get_array_of_q_networks_for_pascal(path_model, class_object) 85 | 86 | ######## LOAD IMAGE NAMES ######## 87 | 88 | if two_databases == 1: 89 | image_names1 = np.array([load_images_names_in_data_set('trainval', path_voc)]) 90 | image_names2 = np.array([load_images_names_in_data_set('trainval', path_voc2)]) 91 | image_names = np.concatenate([image_names1, image_names2]) 92 | else: 93 | image_names = np.array([load_images_names_in_data_set('trainval', path_voc)]) 94 | 95 | ######## LOAD IMAGES ######## 96 | 97 | if two_databases == 1: 98 | images1 = get_all_images(image_names1, path_voc) 99 | images2 = get_all_images(image_names2, path_voc2) 100 | images = np.concatenate([images1, images2]) 101 | else: 102 | images = get_all_images(image_names, path_voc) 103 | 104 | for i in range(epochs_id, epochs_id + epochs): 105 | for j in range(np.size(image_names)): 106 | masked = 0 107 | not_finished = 1 108 | image = np.array(images[j]) 109 | image_name = image_names[0][j] 110 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 111 | if two_databases == 1: 112 | if j < np.size(image_names1): 113 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 114 | else: 115 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc2) 116 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 117 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 118 | region_mask = np.ones([image.shape[0], image.shape[1]]) 119 | shape_gt_masks = np.shape(gt_masks) 120 | available_objects = np.ones(np.size(array_classes_gt_objects)) 121 | # Iterate through all the objects in the ground truth of an image 122 | for k in range(np.size(array_classes_gt_objects)): 123 | # Init visualization 124 | background = Image.new('RGBA', (10000, 2500), (255, 255, 255, 255)) 125 | draw = ImageDraw.Draw(background) 126 | # We check whether the ground truth object is of the target class category 127 | if array_classes_gt_objects[k] == class_object: 128 | gt_mask = gt_masks[:, :, k] 129 | step = 0 130 | new_iou = 0 131 | # this matrix stores the IoU of each object of the ground-truth, just in case 132 | # the agent changes of observed object 133 | last_matrix = np.zeros([np.size(array_classes_gt_objects)]) 134 | region_image = image 135 | offset = (0, 0) 136 | size_mask = (image.shape[0], image.shape[1]) 137 | original_shape = size_mask 138 | old_region_mask = region_mask 139 | region_mask = np.ones([image.shape[0], image.shape[1]]) 140 | # If the ground truth object is already masked by other already found masks, do not 141 | # use it for training 142 | if masked == 1: 143 | for p in range(gt_masks.shape[2]): 144 | overlap = calculate_overlapping(old_region_mask, gt_masks[:, :, p]) 145 | if overlap > 0.60: 146 | available_objects[p] = 0 147 | # We check if there are still obejcts to be found 148 | if np.count_nonzero(available_objects) == 0: 149 | not_finished = 0 150 | # follow_iou function calculates at each time step which is the groun truth object 151 | # that overlaps more with the visual region, so that we can calculate the rewards appropiately 152 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, array_classes_gt_objects, 153 | class_object, last_matrix, available_objects) 154 | 
new_iou = iou 155 | gt_mask = gt_masks[:, :, index] 156 | # init of the history vector that indicates past actions (6 actions * 4 steps in the memory) 157 | history_vector = np.zeros([24]) 158 | # computation of the initial state 159 | state = get_state(region_image, history_vector, model_vgg) 160 | # status indicates whether the agent is still alive and has not triggered the terminal action 161 | status = 1 162 | action = 0 163 | reward = 0 164 | if step > number_of_steps: 165 | background = draw_sequences(i, k, step, action, draw, region_image, background, 166 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 167 | bool_draw) 168 | step += 1 169 | while (status == 1) & (step < number_of_steps) & not_finished: 170 | category = int(array_classes_gt_objects[k]-1) 171 | model = models[0][category] 172 | qval = model.predict(state.T, batch_size=1) 173 | background = draw_sequences(i, k, step, action, draw, region_image, background, 174 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 175 | bool_draw) 176 | step += 1 177 | # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent 178 | if (i < 100) & (new_iou > 0.5): 179 | action = 6 180 | # epsilon-greedy policy 181 | elif random.random() < epsilon: 182 | action = np.random.randint(1, 7) 183 | else: 184 | action = (np.argmax(qval))+1 185 | # terminal action 186 | if action == 6: 187 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 188 | array_classes_gt_objects, class_object, 189 | last_matrix, available_objects) 190 | gt_mask = gt_masks[:, :, index] 191 | reward = get_reward_trigger(new_iou) 192 | background = draw_sequences(i, k, step, action, draw, region_image, background, 193 | path_testing_folder, iou, reward, gt_mask, region_mask, 194 | image_name, bool_draw) 195 | step += 1 196 | # movement action, we perform the crop of the corresponding subregion 197 | else: 198 | region_mask = np.zeros(original_shape) 199 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 200 | if action == 1: 201 | offset_aux = (0, 0) 202 | elif action == 2: 203 | offset_aux = (0, size_mask[1] * scale_mask) 204 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 205 | elif action == 3: 206 | offset_aux = (size_mask[0] * scale_mask, 0) 207 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 208 | elif action == 4: 209 | offset_aux = (size_mask[0] * scale_mask, 210 | size_mask[1] * scale_mask) 211 | offset = (offset[0] + size_mask[0] * scale_mask, 212 | offset[1] + size_mask[1] * scale_mask) 213 | elif action == 5: 214 | offset_aux = (size_mask[0] * scale_mask / 2, 215 | size_mask[0] * scale_mask / 2) 216 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 217 | offset[1] + size_mask[0] * scale_mask / 2) 218 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 219 | offset_aux[1]:offset_aux[1] + size_mask[1]] 220 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 221 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 222 | array_classes_gt_objects, class_object, 223 | last_matrix, available_objects) 224 | gt_mask = gt_masks[:, :, index] 225 | reward = get_reward_movement(iou, new_iou) 226 | iou = new_iou 227 | history_vector = update_history_vector(history_vector, action) 228 | new_state = get_state(region_image, history_vector, model_vgg) 229 | # Experience replay storage 230 | if len(replay[category]) < buffer_experience_replay: 231 | 
replay[category].append((state, action, reward, new_state)) 232 | else: 233 | if h[category] < (buffer_experience_replay-1): 234 | h[category] += 1 235 | else: 236 | h[category] = 0 237 | h_aux = h[category] 238 | h_aux = int(h_aux) 239 | replay[category][h_aux] = (state, action, reward, new_state) 240 | minibatch = random.sample(replay[category], batch_size) 241 | X_train = [] 242 | y_train = [] 243 | # we pick from the replay memory a sampled minibatch and generate the training samples 244 | for memory in minibatch: 245 | old_state, action, reward, new_state = memory 246 | old_qval = model.predict(old_state.T, batch_size=1) 247 | newQ = model.predict(new_state.T, batch_size=1) 248 | maxQ = np.max(newQ) 249 | y = np.zeros([1, 6]) 250 | y = old_qval 251 | y = y.T 252 | if action != 6: #non-terminal state 253 | update = (reward + (gamma * maxQ)) 254 | else: #terminal state 255 | update = reward 256 | y[action-1] = update #target output 257 | X_train.append(old_state) 258 | y_train.append(y) 259 | X_train = np.array(X_train) 260 | y_train = np.array(y_train) 261 | X_train = X_train.astype("float32") 262 | y_train = y_train.astype("float32") 263 | X_train = X_train[:, :, 0] 264 | y_train = y_train[:, :, 0] 265 | hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=0) 266 | models[0][category] = model 267 | state = new_state 268 | if action == 6: 269 | status = 0 270 | masked = 1 271 | # we mask object found with ground-truth so that agent learns faster 272 | image = mask_image_with_mean_background(gt_mask, image) 273 | else: 274 | masked = 0 275 | available_objects[index] = 0 276 | if epsilon > 0.1: 277 | epsilon -= 0.1 278 | for t in range (np.size(models)): 279 | if t == (class_object-1): 280 | string = path_model + '/model' + str(t) + '_epoch_' + str(i) + 'h5' 281 | string2 = path_model + '/model' + str(t) + 'h5' 282 | model = models[0][t] 283 | model.save_weights(string, overwrite=True) 284 | model.save_weights(string2, overwrite=True) 285 | 286 | -------------------------------------------------------------------------------- /scripts/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def calculate_iou(img_mask, gt_mask): 6 | gt_mask *= 1.0 7 | img_and = cv2.bitwise_and(img_mask, gt_mask) 8 | img_or = cv2.bitwise_or(img_mask, gt_mask) 9 | j = np.count_nonzero(img_and) 10 | i = np.count_nonzero(img_or) 11 | iou = float(float(j)/float(i)) 12 | return iou 13 | 14 | 15 | def calculate_overlapping(img_mask, gt_mask): 16 | gt_mask *= 1.0 17 | img_and = cv2.bitwise_and(img_mask, gt_mask) 18 | j = np.count_nonzero(img_and) 19 | i = np.count_nonzero(gt_mask) 20 | overlap = float(float(j)/float(i)) 21 | return overlap 22 | 23 | 24 | def follow_iou(gt_masks, mask, array_classes_gt_objects, object_id, last_matrix, available_objects): 25 | results = np.zeros([np.size(array_classes_gt_objects), 1]) 26 | for k in range(np.size(array_classes_gt_objects)): 27 | if array_classes_gt_objects[k] == object_id: 28 | if available_objects[k] == 1: 29 | gt_mask = gt_masks[:, :, k] 30 | iou = calculate_iou(mask, gt_mask) 31 | results[k] = iou 32 | else: 33 | results[k] = -1 34 | max_result = max(results) 35 | ind = np.argmax(results) 36 | iou = last_matrix[ind] 37 | new_iou = max_result 38 | return iou, new_iou, results, ind 39 | -------------------------------------------------------------------------------- /scripts/parse_xml_annotations.py: 
-------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import numpy as np 3 | 4 | 5 | def get_bb_of_gt_from_pascal_xml_annotation(xml_name, voc_path): 6 | string = voc_path + '/Annotations/' + xml_name + '.xml' 7 | tree = ET.parse(string) 8 | root = tree.getroot() 9 | names = [] 10 | x_min = [] 11 | x_max = [] 12 | y_min = [] 13 | y_max = [] 14 | for child in root: 15 | if child.tag == 'object': 16 | for child2 in child: 17 | if child2.tag == 'name': 18 | names.append(child2.text) 19 | elif child2.tag == 'bndbox': 20 | for child3 in child2: 21 | if child3.tag == 'xmin': 22 | x_min.append(child3.text) 23 | elif child3.tag == 'xmax': 24 | x_max.append(child3.text) 25 | elif child3.tag == 'ymin': 26 | y_min.append(child3.text) 27 | elif child3.tag == 'ymax': 28 | y_max.append(child3.text) 29 | category_and_bb = np.zeros([np.size(names), 5]) 30 | for i in range(np.size(names)): 31 | category_and_bb[i][0] = get_id_of_class_name(names[i]) 32 | category_and_bb[i][1] = x_min[i] 33 | category_and_bb[i][2] = x_max[i] 34 | category_and_bb[i][3] = y_min[i] 35 | category_and_bb[i][4] = y_max[i] 36 | return category_and_bb 37 | 38 | 39 | def get_all_annotations(image_names, voc_path): 40 | annotations = [] 41 | for i in range(np.size(image_names)): 42 | image_name = image_names[0][i] 43 | annotations.append(get_bb_of_gt_from_pascal_xml_annotation(image_name, voc_path)) 44 | return annotations 45 | 46 | 47 | def generate_bounding_box_from_annotation(annotation, image_shape): 48 | length_annotation = annotation.shape[0] 49 | masks = np.zeros([image_shape[0], image_shape[1], length_annotation]) 50 | for i in range(0, length_annotation): 51 | masks[annotation[i, 3]:annotation[i, 4], annotation[i, 1]:annotation[i, 2], i] = 1 52 | return masks 53 | 54 | 55 | def get_ids_objects_from_annotation(annotation): 56 | return annotation[:, 0] 57 | 58 | 59 | def get_id_of_class_name (class_name): 60 | if class_name == 'aeroplane': 61 | return 1 62 | elif class_name == 'bicycle': 63 | return 2 64 | elif class_name == 'bird': 65 | return 3 66 | elif class_name == 'boat': 67 | return 4 68 | elif class_name == 'bottle': 69 | return 5 70 | elif class_name == 'bus': 71 | return 6 72 | elif class_name == 'car': 73 | return 7 74 | elif class_name == 'cat': 75 | return 8 76 | elif class_name == 'chair': 77 | return 9 78 | elif class_name == 'cow': 79 | return 10 80 | elif class_name == 'diningtable': 81 | return 11 82 | elif class_name == 'dog': 83 | return 12 84 | elif class_name == 'horse': 85 | return 13 86 | elif class_name == 'motorbike': 87 | return 14 88 | elif class_name == 'person': 89 | return 15 90 | elif class_name == 'pottedplant': 91 | return 16 92 | elif class_name == 'sheep': 93 | return 17 94 | elif class_name == 'sofa': 95 | return 18 96 | elif class_name == 'train': 97 | return 19 98 | elif class_name == 'tvmonitor': 99 | return 20 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /scripts/pool45_crops_testing.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, 
Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | from scipy import ndimage 13 | from keras.preprocessing import image 14 | from sklearn.preprocessing import OneHotEncoder 15 | 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | if __name__ == "__main__": 25 | 26 | ######## PATHS definition ######## 27 | 28 | # path of pascal voc test 29 | path_voc_test = "./VOC2007_test/" 30 | # model name of the weights 31 | model_name = "model_pool45_crops" 32 | # path of folder where the weights are 33 | weights_path = "../models_pool45_crops/" 34 | # path of where to store visualizations of search sequences 35 | path_testing_folder = '../testing/' 36 | # path of VGG16 weights 37 | path_vgg = "../vgg16_weights.h5" 38 | 39 | ######## MODELS ######## 40 | 41 | model_vgg = get_convolutional_vgg16_compiled(path_vgg) 42 | model = get_q_network(weights_path + model_name) 43 | 44 | ######## LOAD IMAGE NAMES ######## 45 | 46 | image_names = np.array([load_images_names_in_data_set('aeroplane_test', path_voc_test)]) 47 | labels = load_images_labels_in_data_set('aeroplane_test', path_voc_test) 48 | 49 | ######## LOAD IMAGES ######## 50 | 51 | images = get_all_images(image_names, path_voc_test) 52 | 53 | ######## PARAMETERS ######## 54 | 55 | # Class category of PASCAL that the RL agent will be searching 56 | class_object = 1 57 | # 1 if you want to obtain visualizations of the search for objects 58 | bool_draw = 1 59 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 60 | scale_subregion = float(3)/4 61 | scale_mask = float(1)/(scale_subregion*4) 62 | # Number of steps that the agent does at each image 63 | number_of_steps = 10 64 | # Only search first object 65 | only_first_object = 1 66 | 67 | for j in range(np.size(image_names)): 68 | if labels[j] == "1": 69 | image = np.array(images[j]) 70 | # init drawing for visualization 71 | background = Image.new('RGBA', (10000, 2000), (255, 255, 255, 255)) 72 | draw = ImageDraw.Draw(background) 73 | image_name = image_names[0][j] 74 | # get feature maps for the image 75 | feature_maps = get_feature_maps(model_vgg, image) 76 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc_test) 77 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 78 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 79 | size_mask = (image.shape[0], image.shape[1]) 80 | original_shape = size_mask 81 | image_for_search = image 82 | region_mask = np.ones([image.shape[0], image.shape[1]]) 83 | count = 0 84 | # offset of the region observed at each time step 85 | offset = (0, 0) 86 | # absolute status is a boolean we indicate if the agent will continue 87 | # searching object or not. 
If the first object already covers the whole 88 | # image, we can put it at 0 so we do not further search there 89 | absolute_status = 1 90 | action = 0 91 | step = 0 92 | qval = 0 93 | region_image = image_for_search 94 | region_mask = np.ones([image.shape[0], image.shape[1]]) 95 | while (step < number_of_steps) and (absolute_status == 1): 96 | iou = 0 97 | history_vector = np.zeros([24]) 98 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 99 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 100 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 101 | state = get_state_pool45(history_vector, region_descriptor_2) 102 | status = 1 103 | iou = 0 104 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 105 | region_mask, image_name, bool_draw) 106 | size_mask = (image.shape[0], image.shape[1]) 107 | original_shape = size_mask 108 | region_mask = np.ones([image.shape[0], image.shape[1]]) 109 | while (status == 1) & (step < number_of_steps): 110 | step += 1 111 | qval = model.predict(state.T, batch_size=1) 112 | action = (np.argmax(qval))+1 113 | # movement action, make the proper zoom on the image 114 | if action != 6: 115 | region_mask = np.zeros(original_shape) 116 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 117 | if action == 1: 118 | offset_aux = (0, 0) 119 | elif action == 2: 120 | offset_aux = (0, size_mask[1] * scale_mask) 121 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 122 | elif action == 3: 123 | offset_aux = (size_mask[0] * scale_mask, 0) 124 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 125 | elif action == 4: 126 | offset_aux = (size_mask[0] * scale_mask, 127 | size_mask[1] * scale_mask) 128 | offset = (offset[0] + size_mask[0] * scale_mask, 129 | offset[1] + size_mask[1] * scale_mask) 130 | elif action == 5: 131 | offset_aux = (size_mask[0] * scale_mask / 2, 132 | size_mask[0] * scale_mask / 2) 133 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 134 | offset[1] + size_mask[0] * scale_mask / 2) 135 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 136 | offset_aux[1]:offset_aux[1] + size_mask[1]] 137 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 138 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 139 | region_mask, image_name, bool_draw) 140 | # trigger action 141 | if action == 6: 142 | offset = (0, 0) 143 | status = 0 144 | if step == 1: 145 | absolute_status = 0 146 | if only_first_object == 1: 147 | absolute_status = 0 148 | image_for_search = mask_image_with_mean_background(region_mask, image_for_search) 149 | region_image = image_for_search 150 | feature_maps = get_feature_maps(model_vgg, region_image) 151 | history_vector = update_history_vector(history_vector, action) 152 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 153 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 154 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 155 | state = get_state_pool45(history_vector, region_descriptor_2) 156 | -------------------------------------------------------------------------------- /scripts/pool45_crops_training.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | 
from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | import argparse 13 | from scipy import ndimage 14 | from keras.preprocessing import image 15 | from sklearn.preprocessing import OneHotEncoder 16 | 17 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 18 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 19 | from parse_xml_annotations import * 20 | from image_helper import * 21 | from metrics import * 22 | from visualization import * 23 | from reinforcement import * 24 | 25 | 26 | # Read number of epoch to be trained, to make checkpointing 27 | parser = argparse.ArgumentParser(description='Epoch:') 28 | parser.add_argument("-n", metavar='N', type=int, default=0) 29 | args = parser.parse_args() 30 | epochs_id = int(args.n) 31 | 32 | 33 | if __name__ == "__main__": 34 | 35 | ######## PATHS definition ######## 36 | 37 | # path of PASCAL VOC 2012 or other database to use for training 38 | path_voc = "./VOC2012_train/" 39 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 40 | # path_voc2 = "/gpfs/projects/bsc31/bsc31429/VOC2007_train/" 41 | # path of where to store the models 42 | path_model = "../models_pool45_crops" 43 | # path of where to store visualizations of search sequences 44 | path_testing_folder = '../testing' 45 | # path of VGG16 weights 46 | path_vgg = "../vgg16_weights.h5" 47 | 48 | ######## PARAMETERS ######## 49 | 50 | # Class category of PASCAL that the RL agent will be searching 51 | class_object = 1 52 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 53 | scale_subregion = float(3)/4 54 | scale_mask = float(1)/(scale_subregion*4) 55 | # 1 if you want to obtain visualizations of the search for objects 56 | bool_draw = 0 57 | # How many steps can run the agent until finding one object 58 | number_of_steps = 10 59 | # Boolean to indicate if you want to use the two databases, or just one 60 | two_databases = 0 61 | epochs = 50 62 | gamma = 0.90 63 | epsilon = 1 64 | batch_size = 100 65 | # Pointer to where to store the last experience in the experience replay buffer, 66 | # actually there is a pointer for each PASCAL category, in case all categories 67 | # are trained at the same time 68 | h = np.zeros([20]) 69 | # Each replay memory (one for each possible category) has a capacity of 100 experiences 70 | buffer_experience_replay = 1000 71 | # Init replay memories 72 | replay = [[] for i in range(20)] 73 | reward = 0 74 | 75 | ######## MODELS ######## 76 | 77 | model_vgg = get_convolutional_vgg16_compiled(path_vgg) 78 | 79 | # If you want to train it from first epoch, first option is selected. Otherwise, 80 | # when making checkpointing, weights of last stored weights are loaded for a particular class object 81 | # NOTICE that for POOL45 model, this script only can train one class category at a time. 
We did this as 82 | # we are pre-computing features and storing them to RAM, and it is not possible to store features for all 83 | # objects of all classes 84 | 85 | if epochs_id == 0: 86 | model = get_q_network("0") 87 | else: 88 | model = get_q_network(path_model + '/model' + str(class_object-1) + 'h5') 89 | 90 | ######## LOAD IMAGE NAMES ######## 91 | 92 | if two_databases == 1: 93 | image_names_1 = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc)]) 94 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc) 95 | image_names_1_2 = [] 96 | for i in range(0, np.size(labels)): 97 | if labels[i] == "1": 98 | image_names_1_2.append(image_names_1[0][i]) 99 | image_names_2 = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc2)]) 100 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc2) 101 | image_names_2_2 = [] 102 | for i in range(0, np.size(labels)): 103 | if labels[i] == "1": 104 | image_names_2_2.append(image_names_2[0][i]) 105 | image_names = np.concatenate([image_names_1_2, image_names_2_2], axis=1) 106 | else: 107 | image_names = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc)]) 108 | # We check in the annotations which of the images actually contain the class category that we want 109 | # notice that as we want to train it for planes (class category 1) we input this subset of the database 110 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc) 111 | image_names_2 = [] 112 | for i in range(0, np.size(labels)): 113 | if labels[i] == "1": 114 | image_names_2.append(image_names[0][i]) 115 | image_names = image_names_2 116 | 117 | ######## LOAD IMAGES ######## 118 | 119 | if two_databases == 1: 120 | images1 = get_all_images_pool(image_names_1_2, path_voc) 121 | images2 = get_all_images_pool(image_names_2_2, path_voc2) 122 | images = images1 + images2 123 | else: 124 | images = get_all_images_pool(image_names, path_voc) 125 | 126 | 127 | ######## PRECOMPUTE ALL INITIAL FEATURE MAPS ######## 128 | 129 | if two_databases == 1: 130 | initial_feature_maps1 = calculate_all_initial_feature_maps(images1, model_vgg, image_names_1_2) 131 | initial_feature_maps2 = calculate_all_initial_feature_maps(images2, model_vgg, image_names_2_2) 132 | initial_feature_maps = initial_feature_maps1 + initial_feature_maps2 133 | else: 134 | initial_feature_maps = calculate_all_initial_feature_maps(images, model_vgg, image_names) 135 | 136 | for i in range(epochs_id, epochs_id+epochs_batch): 137 | for j in range(np.size(image_names)): 138 | masked = 0 139 | not_finished = 1 140 | image = np.array(images[j]) 141 | image_name = image_names[j] 142 | feature_maps = initial_feature_maps[j] 143 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 144 | if two_databases == 1: 145 | if j < np.size(image_names1_2): 146 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 147 | else: 148 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc2) 149 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 150 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 151 | region_mask = np.ones([image.shape[0], image.shape[1]]) 152 | shape_gt_masks = np.shape(gt_masks) 153 | available_objects = np.ones(np.size(array_classes_gt_objects)) 154 | # Iterate through all the objects in the ground truth of an image 155 | for k in range(np.size(array_classes_gt_objects)): 156 | # Init visualization 157 | 
background = Image.new('RGBA', (10000, 2500), (255, 255, 255, 255)) 158 | draw = ImageDraw.Draw(background) 159 | # We check whether the ground truth object is of the target class category 160 | if array_classes_gt_objects[k] == class_object: 161 | gt_mask = gt_masks[:, :, k] 162 | step = 0 163 | reward = 0 164 | # this matrix stores the IoU of each object of the ground-truth, just in case 165 | # the agent changes of observed object 166 | last_matrix = np.zeros([np.size(array_classes_gt_objects)]) 167 | new_iou = 0 168 | region_image = image 169 | offset = (0, 0) 170 | size_mask = (image.shape[0], image.shape[1]) 171 | original_shape = size_mask 172 | old_region_mask = region_mask 173 | region_mask = np.ones([image.shape[0], image.shape[1]]) 174 | # If the ground truth object is already masked by other already found masks, do not 175 | # use it for training 176 | if masked == 1: 177 | for p in range(gt_masks.shape[2]): 178 | overlap = calculate_overlapping(old_region_mask, gt_masks[:, :, p]) 179 | if overlap > 0.6: 180 | available_objects[p] = 0 181 | # We check if there are still objects to be found 182 | if np.count_nonzero(available_objects) == 0: 183 | not_finished = 0 184 | # follow_iou function calculates at each time step which is the groun truth object 185 | # that overlaps more with the visual region, so that we can calculate the rewards appropiately 186 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, array_classes_gt_objects, 187 | class_object, last_matrix, available_objects) 188 | new_iou = iou 189 | gt_mask = gt_masks[:, :, index] 190 | # init of the history vector that indicates past actions (6 actions * 4 steps in the memory) 191 | history_vector = np.zeros([24]) 192 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 193 | # calculate descriptor of region by ROI-pooling 194 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 195 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 196 | # computation of the initial state 197 | state = get_state_pool45(history_vector, region_descriptor_2) 198 | # status indicates whether the agent is still alive and has not triggered the terminal action 199 | status = 1 200 | action = 0 201 | if step > number_of_steps: 202 | background = draw_sequences(i, k, step, action, draw, region_image, background, 203 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 204 | bool_draw) 205 | step += 1 206 | while (status == 1) & (step < number_of_steps) & not_finished: 207 | category = int(array_classes_gt_objects[k]-1) 208 | counter[category] += 1 209 | qval = model.predict(state.T, batch_size=1) 210 | background = draw_sequences(i, k, step, action, draw, region_image, background, 211 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 212 | bool_draw) 213 | step += 1 214 | # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent 215 | if (i < 100) & (new_iou > 0.5): 216 | action = 6 217 | # epsilon-greedy policy 218 | elif random.random() < epsilon: 219 | action = np.random.randint(1, 7) 220 | else: 221 | action = (np.argmax(qval))+1 222 | # terminal action 223 | if action == 6: 224 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 225 | array_classes_gt_objects, class_object, 226 | last_matrix, available_objects) 227 | gt_mask = gt_masks[:, :, index] 228 | reward = get_reward_trigger(new_iou) 229 | background = draw_sequences(i, k, step, action, draw, 
region_image, background, 230 | path_testing_folder, iou, reward, gt_mask, region_mask, 231 | image_name, bool_draw) 232 | step += 1 233 | # movement action, we perform the crop of the corresponding subregion 234 | else: 235 | region_mask = np.zeros(original_shape) 236 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 237 | if action == 1: 238 | offset_aux = (0, 0) 239 | elif action == 2: 240 | offset_aux = (0, size_mask[1] * scale_mask) 241 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 242 | elif action == 3: 243 | offset_aux = (size_mask[0] * scale_mask, 0) 244 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 245 | elif action == 4: 246 | offset_aux = (size_mask[0] * scale_mask, 247 | size_mask[1] * scale_mask) 248 | offset = (offset[0] + size_mask[0] * scale_mask, 249 | offset[1] + size_mask[1] * scale_mask) 250 | elif action == 5: 251 | offset_aux = (size_mask[0] * scale_mask / 2, 252 | size_mask[0] * scale_mask / 2) 253 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 254 | offset[1] + size_mask[0] * scale_mask / 2) 255 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 256 | offset_aux[1]:offset_aux[1] + size_mask[1]] 257 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 258 | # new_IoU=calculateIoU(region_mask,gt_mask) 259 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 260 | array_classes_gt_objects, class_object, 261 | last_matrix, available_objects) 262 | gt_mask = gt_masks[:, :, index] 263 | reward = get_reward_movement(iou, new_iou) 264 | iou = new_iou 265 | history_vector = update_history_vector(history_vector, action) 266 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 267 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 268 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 269 | new_state = get_state_pool45(history_vector, region_descriptor_2) 270 | #Experience replay storage 271 | if len(replay[category]) < buffer_experience_replay: 272 | replay[category].append((state, action, reward, new_state)) 273 | else: 274 | if h[category] < (buffer_experience_replay-1): 275 | h[category] += 1 276 | else: 277 | h[category] = 0 278 | h_aux = h[category] 279 | h_aux = int(h_aux) 280 | replay[category][h_aux] = (state, action, reward, new_state) 281 | minibatch = random.sample(replay[category], batch_size) 282 | X_train = [] 283 | y_train = [] 284 | # we pick from the replay memory a sampled minibatch and generate the training samples 285 | for memory in minibatch: 286 | old_state, action, reward, new_state = memory 287 | old_qval = model.predict(old_state.T, batch_size=1) 288 | newQ = model.predict(new_state.T, batch_size=1) 289 | maxQ = np.max(newQ) 290 | y = np.zeros([1, 6]) 291 | y = old_qval 292 | y = y.T 293 | if action != 6: #non-terminal state 294 | update = (reward + (gamma * maxQ)) 295 | else: #terminal state 296 | update = reward 297 | y[action-1] = update #target output 298 | X_train.append(old_state) 299 | y_train.append(y) 300 | X_train = np.array(X_train) 301 | y_train = np.array(y_train) 302 | X_train = X_train.astype("float32") 303 | y_train = y_train.astype("float32") 304 | X_train = X_train[:, :, 0] 305 | y_train = y_train[:, :, 0] 306 | hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=0) 307 | state = new_state 308 | if action == 6: 309 | status = 0 310 | masked = 1 311 | # we mask object found with 
ground-truth so that agent learns faster 312 | image = mask_image_with_mean_background(gt_mask, image) 313 | else: 314 | masked = 0 315 | available_objects[index] = 0 316 | if epsilon > 0.1: 317 | epsilon -= 0.1 318 | string = path_model + '/model' + str(class_object-1) + '_epoch_' + str(i) + 'h5' 319 | string2 = path_model + '/model' + str(class_object-1) + 'h5' 320 | model.save_weights(string, overwrite=True) 321 | model.save_weights(string2, overwrite=True) 322 | 323 | -------------------------------------------------------------------------------- /scripts/reinforcement.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras import initializations 4 | from keras.initializations import normal, identity 5 | from keras.layers.core import Dense, Dropout, Activation, Flatten 6 | from keras.layers.recurrent import LSTM 7 | from keras.optimizers import RMSprop, SGD, Adam 8 | from features import * 9 | 10 | # Different actions that the agent can do 11 | number_of_actions = 6 12 | # Actions captures in the history vector 13 | actions_of_history = 4 14 | # Visual descriptor size 15 | visual_descriptor_size = 25088 16 | # Reward movement action 17 | reward_movement_action = 1 18 | # Reward terminal action 19 | reward_terminal_action = 3 20 | # IoU required to consider a positive detection 21 | iou_threshold = 0.5 22 | 23 | 24 | def update_history_vector(history_vector, action): 25 | action_vector = np.zeros(number_of_actions) 26 | action_vector[action-1] = 1 27 | size_history_vector = np.size(np.nonzero(history_vector)) 28 | updated_history_vector = np.zeros(number_of_actions*actions_of_history) 29 | if size_history_vector < actions_of_history: 30 | aux2 = 0 31 | for l in range(number_of_actions*size_history_vector, number_of_actions*size_history_vector+number_of_actions - 1): 32 | history_vector[l] = action_vector[aux2] 33 | aux2 += 1 34 | return history_vector 35 | else: 36 | for j in range(0, number_of_actions*(actions_of_history-1) - 1): 37 | updated_history_vector[j] = history_vector[j+number_of_actions] 38 | aux = 0 39 | for k in range(number_of_actions*(actions_of_history-1), number_of_actions*actions_of_history): 40 | updated_history_vector[k] = action_vector[aux] 41 | aux += 1 42 | return updated_history_vector 43 | 44 | 45 | def get_state(image, history_vector, model_vgg): 46 | descriptor_image = get_conv_image_descriptor_for_image(image, model_vgg) 47 | descriptor_image = np.reshape(descriptor_image, (visual_descriptor_size, 1)) 48 | history_vector = np.reshape(history_vector, (number_of_actions*actions_of_history, 1)) 49 | state = np.vstack((descriptor_image, history_vector)) 50 | return state 51 | 52 | 53 | def get_state_pool45(history_vector, region_descriptor): 54 | history_vector = np.reshape(history_vector, (24, 1)) 55 | return np.vstack((region_descriptor, history_vector)) 56 | 57 | 58 | def get_reward_movement(iou, new_iou): 59 | if new_iou > iou: 60 | reward = reward_movement_action 61 | else: 62 | reward = - reward_movement_action 63 | return reward 64 | 65 | 66 | def get_reward_trigger(new_iou): 67 | if new_iou > iou_threshold: 68 | reward = reward_terminal_action 69 | else: 70 | reward = - reward_terminal_action 71 | return reward 72 | 73 | 74 | def get_q_network(weights_path): 75 | model = Sequential() 76 | model.add(Dense(1024, init=lambda shape, name: normal(shape, scale=0.01, name=name), input_shape=(25112,))) 77 | model.add(Activation('relu')) 78 | 
model.add(Dropout(0.2)) 79 | model.add(Dense(1024, init=lambda shape, name: normal(shape, scale=0.01, name=name))) 80 | model.add(Activation('relu')) 81 | model.add(Dropout(0.2)) 82 | model.add(Dense(6, init=lambda shape, name: normal(shape, scale=0.01, name=name))) 83 | model.add(Activation('linear')) 84 | adam = Adam(lr=1e-6) 85 | model.compile(loss='mse', optimizer=adam) 86 | if weights_path != "0": 87 | model.load_weights(weights_path) 88 | return model 89 | 90 | 91 | def get_array_of_q_networks_for_pascal(weights_path, class_object): 92 | q_networks = [] 93 | if weights_path == "0": 94 | for i in range(20): 95 | q_networks.append(get_q_network("0")) 96 | else: 97 | for i in range(20): 98 | if i == (class_object-1): 99 | q_networks.append(get_q_network(weights_path + "/model" + str(i) + "h5")) 100 | else: 101 | q_networks.append(get_q_network("0")) 102 | return np.array([q_networks]) -------------------------------------------------------------------------------- /scripts/visualization.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont 2 | import numpy as np 3 | 4 | path_font = "/usr/share/fonts/liberation/LiberationMono-Regular.ttf" 5 | font = ImageFont.truetype(path_font, 24) 6 | 7 | 8 | def string_for_action(action): 9 | if action == 0: 10 | return "START" 11 | if action == 1: 12 | return 'up-left' 13 | elif action == 2: 14 | return 'up-right' 15 | elif action == 3: 16 | return 'down-left' 17 | elif action == 4: 18 | return 'down-right' 19 | elif action == 5: 20 | return 'center' 21 | elif action == 6: 22 | return 'TRIGGER' 23 | 24 | 25 | def draw_sequences(i, k, step, action, draw, region_image, background, path_testing_folder, iou, reward, 26 | gt_mask, region_mask, image_name, save_boolean): 27 | mask = Image.fromarray(255 * gt_mask) 28 | mask_img = Image.fromarray(255 * region_mask) 29 | image_offset = (1000 * step, 70) 30 | text_offset = (1000 * step, 550) 31 | masked_image_offset = (1000 * step, 1400) 32 | mask_offset = (1000 * step, 700) 33 | action_string = string_for_action(action) 34 | footnote = 'action: ' + action_string + ' ' + 'reward: ' + str(reward) + ' Iou:' + str(iou) 35 | draw.text(text_offset, str(footnote), (0, 0, 0), font=font) 36 | img_for_paste = Image.fromarray(region_image) 37 | background.paste(img_for_paste, image_offset) 38 | background.paste(mask, mask_offset) 39 | background.paste(mask_img, masked_image_offset) 40 | file_name = path_testing_folder + '/' + image_name + str(i) + '_object_' + str(k) + '.png' 41 | if save_boolean == 1: 42 | background.save(file_name) 43 | return background 44 | 45 | 46 | def draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 47 | region_mask, image_name, save_boolean): 48 | aux = np.asarray(region_image, np.uint8) 49 | img_offset = (1000 * step, 70) 50 | footnote_offset = (1000 * step, 550) 51 | q_predictions_offset = (1000 * step, 500) 52 | mask_img_offset = (1000 * step, 700) 53 | img_for_paste = Image.fromarray(aux) 54 | background.paste(img_for_paste, img_offset) 55 | mask_img = Image.fromarray(255 * region_mask) 56 | background.paste(mask_img, mask_img_offset) 57 | footnote = 'action: ' + str(action) 58 | q_val_predictions_text = str(qval) 59 | draw.text(footnote_offset, footnote, (0, 0, 0), font=font) 60 | draw.text(q_predictions_offset, q_val_predictions_text, (0, 0, 0), font=font) 61 | file_name = path_testing_folder + image_name + '.png' 62 | if save_boolean == 1: 63 | 
background.save(file_name) 64 | return background 65 | 66 | 67 | --------------------------------------------------------------------------------
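The annotation helpers in parse_xml_annotations.py turn a PASCAL VOC XML file into an (n, 5) array of [class id, xmin, xmax, ymin, ymax] rows, from which per-object binary masks are built. A minimal usage sketch follows; the VOC path and image id are illustrative only, and note that generate_bounding_box_from_annotation slices with the raw float coordinates, so on recent NumPy versions they need an explicit int() cast first:

    import cv2
    from parse_xml_annotations import (get_bb_of_gt_from_pascal_xml_annotation,
                                       generate_bounding_box_from_annotation,
                                       get_ids_objects_from_annotation)

    voc_path = "./VOC2007_test"      # illustrative: any folder with Annotations/ and JPEGImages/
    image_id = "000005"              # illustrative PASCAL VOC image id

    annotation = get_bb_of_gt_from_pascal_xml_annotation(image_id, voc_path)
    class_ids = get_ids_objects_from_annotation(annotation)   # PASCAL ids 1..20 (1 = aeroplane)
    image = cv2.imread(voc_path + "/JPEGImages/" + image_id + ".jpg")
    gt_masks = generate_bounding_box_from_annotation(annotation, image.shape)
    # gt_masks[:, :, k] is an (H, W) mask set to 1 inside the k-th ground-truth box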
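calculate_iou in metrics.py compares two whole-image binary masks, and reinforcement.py maps IoU changes to rewards: a movement is worth +1 if the IoU with the attended ground-truth object improved and -1 otherwise, while the trigger is worth +3 only if the final IoU exceeds the 0.5 threshold. A toy example with made-up 10 x 10 masks, assuming scripts/ is on the import path:

    import numpy as np
    from metrics import calculate_iou

    # ground-truth box covers rows 0-5, the attended region covers rows 2-7
    gt_mask = np.zeros((10, 10))
    gt_mask[:6, :] = 1.0
    region_mask = np.zeros((10, 10))
    region_mask[2:8, :] = 1.0

    iou = calculate_iou(region_mask, gt_mask)   # 40 shared pixels / 80 in the union = 0.5
    # get_reward_movement(0.3, iou) -> +1, because the IoU improved over the previous step
    # get_reward_trigger(iou)       -> -3, because 0.5 does not exceed the 0.5 threshold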
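Every state ends with a 24-dimensional history vector, a rolling one-hot record of the last four actions (6 actions x 4 remembered steps) maintained by update_history_vector. A compact slice-based sketch of the same idea follows (the helper name is ours): it always keeps the newest action in the last six-element block, whereas the original first fills empty blocks from the left and, as written, its explicit loops stop one index short of each block, so the sixth component of a block is never set.

    import numpy as np

    NUMBER_OF_ACTIONS = 6
    ACTIONS_OF_HISTORY = 4

    def updated_history(history_vector, action):
        """Shift the 6 x 4 action history one block to the left and append the new action."""
        one_hot = np.zeros(NUMBER_OF_ACTIONS)
        one_hot[action - 1] = 1
        new_history = np.zeros(NUMBER_OF_ACTIONS * ACTIONS_OF_HISTORY)
        # keep the three most recent actions, dropping the oldest block
        new_history[:NUMBER_OF_ACTIONS * (ACTIONS_OF_HISTORY - 1)] = history_vector[NUMBER_OF_ACTIONS:]
        # write the newest action into the last block
        new_history[NUMBER_OF_ACTIONS * (ACTIONS_OF_HISTORY - 1):] = one_hot
        return new_history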
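The five movement actions share the same geometry in image_zooms_training.py, pool45_crops_training.py and pool45_crops_testing.py: the child window keeps scale_subregion (3/4 by default) of the parent per side, and since scale_mask = 1/(4 * scale_subregion), the offset step new_size * scale_mask is exactly the quarter of the parent left over, so the five candidate windows overlap. A self-contained sketch of that mapping (the helper name zoom_region is ours; coordinates are cast to int here, and the centre shift is computed per axis, while the scripts keep floats and derive both centre shifts from the row size):

    def zoom_region(offset, size, action, scale_subregion=3.0 / 4):
        """Return (offset, size) of the subwindow picked by a movement action.

        offset -- (row, col) of the current window in absolute image coordinates
        size   -- (height, width) of the current window
        action -- 1 up-left, 2 up-right, 3 down-left, 4 down-right, 5 centre
        """
        new_size = (int(size[0] * scale_subregion), int(size[1] * scale_subregion))
        step = (size[0] - new_size[0], size[1] - new_size[1])   # margin left inside the parent
        if action == 1:
            delta = (0, 0)
        elif action == 2:
            delta = (0, step[1])
        elif action == 3:
            delta = (step[0], 0)
        elif action == 4:
            delta = (step[0], step[1])
        elif action == 5:
            delta = (step[0] // 2, step[1] // 2)
        else:
            raise ValueError("movement actions are 1..5; action 6 is the trigger")
        return (offset[0] + delta[0], offset[1] + delta[1]), new_size

Applied repeatedly, the window after n zooms covers (3/4)^n of the original side length, which is what produces the hierarchical analysis described in the paper.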
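During training the action is chosen epsilon-greedily over the six predicted Q-values, with one shortcut: in the early epochs the trigger is forced whenever the current IoU is already above 0.5, so the agent sees positive terminal rewards from the very beginning. The same logic, pulled out into a helper whose name is ours:

    import random
    import numpy as np

    def select_action(qval, new_iou, epoch, epsilon, force_threshold=0.5):
        """Epsilon-greedy selection with the forced trigger used while training."""
        if epoch < 100 and new_iou > force_threshold:
            return 6                          # force the terminal action on good regions
        if random.random() < epsilon:
            return np.random.randint(1, 7)    # explore: any of the six actions
        return int(np.argmax(qval)) + 1       # exploit: action with the highest Q-value

In the scripts epsilon starts at 1 and is decreased by 0.1 per epoch down to 0.1, so exploration is mostly annealed after about ten epochs.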
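Each stored transition (state, action, reward, new_state) becomes a regression target by copying the network's current Q-vector for the old state and overwriting only the entry of the taken action: with reward + gamma * max Q(new_state) for movement actions and with the bare reward for the terminal trigger. A sketch of that target construction (the helper name is ours), using the gamma = 0.90 and six-action layout of the scripts:

    import numpy as np

    def build_q_target(model, transition, gamma=0.90, trigger_action=6):
        """Turn one replay-buffer entry into an (input, target) pair for model.fit."""
        old_state, action, reward, new_state = transition
        target = model.predict(old_state.T, batch_size=1)[0].copy()   # shape (6,)
        if action == trigger_action:
            target[action - 1] = reward                                # terminal: no bootstrap
        else:
            max_future_q = np.max(model.predict(new_state.T, batch_size=1))
            target[action - 1] = reward + gamma * max_future_q
        return old_state, target

Sampling batch_size = 100 such transitions from the replay memory, stacking the pairs and fitting for a single epoch reproduces the update performed inside the training loops.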
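The scripts target the Keras 1.x API (keras.initializations, init=..., nb_epoch=...), so they should be run with the versions pinned in requirements.txt. For orientation only, a rough Keras 2 sketch of the Q-network that get_q_network builds (25112 -> 1024 -> 1024 -> 6, MSE loss, Adam with a 1e-6 learning rate); this is an approximation, not a tested drop-in replacement:

    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.initializers import RandomNormal
    from keras.optimizers import Adam

    def get_q_network_keras2(weights_path="0"):
        init = RandomNormal(stddev=0.01)
        model = Sequential()
        model.add(Dense(1024, kernel_initializer=init, input_shape=(25112,)))
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1024, kernel_initializer=init))
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
        model.add(Dense(6, kernel_initializer=init))
        model.add(Activation('linear'))
        model.compile(loss='mse', optimizer=Adam(lr=1e-6))
        if weights_path != "0":
            model.load_weights(weights_path)
        return model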
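pool45_crops_training.py checkpoints the Q-network after every epoch as a per-epoch file plus a rolling file, both built by plain string concatenation with no dot before the h5 suffix, and the -n argument selects the epoch to resume from, in which case the rolling file is reloaded. The naming, spelled out (note that the epoch loop also references epochs_batch, which is not defined among the parameters in this excerpt, so it has to be set, for example to epochs, before a run):

    path_model = "../models_pool45_crops"
    class_object = 1                     # aeroplane
    epoch = 10

    per_epoch_file = path_model + '/model' + str(class_object - 1) + '_epoch_' + str(epoch) + 'h5'
    rolling_file = path_model + '/model' + str(class_object - 1) + 'h5'
    # -> '../models_pool45_crops/model0_epoch_10h5' and '../models_pool45_crops/model0h5'
    # a later run started with `python pool45_crops_training.py -n 10` reloads the rolling file,
    # since get_q_network is called with exactly the same concatenated name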