├── .gitignore ├── LICENSE ├── README.md ├── authors ├── FerranMarques160x160.jpg ├── JordiTorres.jpg ├── JordiTorres160x160.jpg ├── MiriamBellver160x160.jpg ├── XavierGiro160x160.jpg ├── carlos160x160.jpeg ├── giro.jpg ├── marques.jpg └── miriam.jpg ├── bellver-2016-nipsws.pdf ├── img ├── HR_sequences.png ├── architecture.png ├── hierarchy.png ├── results.png └── thumbnail.png ├── logos ├── MEyC.png ├── bsc.jpg ├── bsc320x86.jpg ├── etsetb.png ├── excellence_center.png ├── generalitat.jpg ├── gpi.png ├── gpi320x70.png ├── nips500x95.png ├── nvidia.jpg ├── severo_ochoa.png └── upc.jpg ├── requirements.txt └── scripts ├── features.py ├── image_helper.py ├── image_zooms_testing.py ├── image_zooms_training.py ├── metrics.py ├── parse_xml_annotations.py ├── pool45_crops_testing.py ├── pool45_crops_training.py ├── reinforcement.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Image Processing Group - BarcelonaTECH - UPC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Object Detection with Deep Reinforcement Learning 2 | 3 | | ![NIPS 2016 logo][logo-nips] | Paper accepted at [Deep Reinforcement Learning Workshop, NIPS 2016](https://sites.google.com/site/deeprlnips2016/) | 4 | |:-:|---| 5 | 6 | [logo-nips]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/nips500x95.png?raw=true "NIPS 2016 logo" 7 | 8 | | ![Míriam Bellver][bellver-photo] | ![Xavier Giro-i-Nieto][giro-photo] | ![Ferran Marqués][marques-photo] | ![Jordi Torres][torres-photo] | 9 | |:-:|:-:|:-:|:-:| 10 | | [Míriam Bellver][bellver-web] | [Xavier Giro-i-Nieto][giro-web] | [Ferran Marques][marques-web] | [Jordi Torres][torres-web] | 11 | 12 | 13 | [bellver-web]: https://www.bsc.es/bellver-bueno-miriam 14 | [giro-web]: https://imatge.upc.edu/web/people/xavier-giro 15 | [torres-web]: http://www.jorditorres.org/ 16 | [marques-web]:https://imatge.upc.edu/web/people/ferran-marques 17 | 18 | [bellver-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/MiriamBellver160x160.jpg?raw=true "Míriam Bellver" 19 | [giro-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/XavierGiro160x160.jpg?raw=true "Xavier Giró-i-Nieto" 20 | [marques-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/FerranMarques160x160.jpg?raw=true "Ferran Marqués" 21 | [torres-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/JordiTorres.jpg?raw=true "Jordi Torres" 22 | 23 | A joint collaboration between: 24 | 25 | |![logo-bsc] | ![logo-gpi] | 26 | |:-:|:-:| 27 | | [Barcelona Supercomputing Center][bsc-web] | [UPC Image Processing Group][gpi-web] | 28 | 29 | [gpi-web]: https://imatge.upc.edu/web/ 30 | [bsc-web]: http://www.bsc.es 31 | 32 | [logo-bsc]:https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/bsc320x86.jpg?raw=true "Barcelona Supercomputing Center" 33 | [logo-gpi]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/gpi320x70.png?raw=true "UPC Image Processing Group" 34 | [logo-severo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/severo_ochoa.png?raw=true "Severo Ochoa" 35 | 36 | ## Summary 37 | 38 | We present a method for performing hierarchical object detection in images guided by a deep reinforcement learning agent. The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated providing a hierarchical image analysis. We compare two different candidate proposal strategies to guide the object search: with and without overlap. 
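As an illustration of the candidate regions the agent chooses among, the following minimal sketch (not part of the repository scripts; `candidate_subregions` is a hypothetical helper) generates the five overlapping subregions used by the hierarchy with the default `scale_subregion = 3/4`: four candidates anchored at the corners of the parent window plus one centred candidate. A sixth action triggers the end of the search on the current region.

```python
# Illustrative sketch only: the five overlapping subregion candidates
# (four corners + centre) at scale_subregion = 3/4, mirroring the offset
# arithmetic of the zoom actions in image_zooms_training.py.
def candidate_subregions(y, x, height, width, scale_subregion=3.0 / 4):
    new_h, new_w = height * scale_subregion, width * scale_subregion
    step_y, step_x = height - new_h, width - new_w  # slack left inside the parent window
    return [
        (y, x, new_h, new_w),                            # top-left
        (y, x + step_x, new_h, new_w),                   # top-right
        (y + step_y, x, new_h, new_w),                   # bottom-left
        (y + step_y, x + step_x, new_h, new_w),          # bottom-right
        (y + step_y / 2, x + step_x / 2, new_h, new_w),  # centre
    ]
```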
39 | 40 | ![Hierarchy of overlapping region proposals](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/hierarchy.png?raw=true) 41 | 42 | Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image and later generates crops of them for each region proposal. 43 | 44 | ![Architectures for convolutional feature extraction](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/architecture.png?raw=true) 45 | 46 | Experiments indicate better results for the overlapping candidate proposal strategy, and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with large amounts of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions. 47 | 48 | ![Qualitative results](https://github.com/imatge-upc/detection-2016-nipsws/blob/master/img/HR_sequences.png?raw=true) 49 | 50 | ## Publication 51 | 52 | Our workshop paper is available on [arXiv](https://arxiv.org/abs/1611.03718), and the related slides [here](http://www.slideshare.net/xavigiro/hierarchical-object-detection-with-deep-reinforcement-learning). 53 | 54 | Please cite with the following Bibtex code: 55 | 56 | ```` 57 | @InProceedings{Bellver_2016_NIPSWS, 58 | author = {Bellver, Miriam and Giro-i-Nieto, Xavier and Marques, Ferran and Torres, Jordi}, 59 | title = {Hierarchical Object Detection with Deep Reinforcement Learning}, 60 | booktitle = {Deep Reinforcement Learning Workshop, NIPS}, 61 | month = {December}, 62 | year = {2016} 63 | } 64 | ```` 65 | 66 | You may also want to refer to our publication in the more human-friendly Chicago style: 67 | 68 | *Miriam Bellver, Xavier Giro-i-Nieto, Ferran Marques, and Jordi Torres. "Hierarchical Object Detection with Deep Reinforcement Learning." In Deep Reinforcement Learning Workshop (NIPS). 2016.* 69 | 70 | ## Code Instructions 71 | 72 | This Python code enables both training and testing of the two models proposed in the paper. The Image Zooms model extracts features for each region visited, whereas the Pool45 Crops model extracts features just once and then ROI-pools them for each subregion. In this section we describe how to use the code. The code uses the Keras framework. If you are using a virtual environment, you can use the requirements.txt provided. 73 | 74 | 75 | First, it is important to note that this code is already an extension of the code used for the paper. During the training stage, we do not consider only one object per image: we also train for other objects by covering the already found objects with the VGG-16 mean, inspired by what Caicedo et al. did in Active Object Localization with Deep Reinforcement Learning. 76 | 77 | ### Setup 78 | 79 | First of all, the VGG-16 weights should be downloaded from the following link: [VGG-16 weights]. If you want to use pre-trained models for the Deep Q-network, they can be downloaded from the following link: [Image Zooms model].
Notice that these models could lead to results different from the ones provided in the paper, because they are already trained to find more than one instance of planes in the image. You should also create two folders in the root of the project, called models_image_zooms and models_pool45_crops, and store the corresponding weights inside them. 80 | 81 | 82 | [VGG-16 weights]: http://imatge.upc.edu/web/sites/default/files/projects/deeplearning/public/detection-2016-nipsws/vgg16_weights.h5 83 | [Image Zooms model]: http://imatge.upc.edu/web/sites/default/files/projects/deeplearning/public/detection-2016-nipsws/model_image_zooms_2 84 | 85 | 86 | ### Usage 87 | 88 | ##### Training 89 | 90 | As an example, we will follow how to train the Image Zooms model, which is the one that achieves better results. The instructions are the same for training the Pool45 Crops model. The script is image_zooms_training.py, and the paths to the database should be configured first. The default paths are the following: 91 | 92 | # path of PASCAL VOC 2012 or other database to use for training 93 | path_voc = "./VOC2012/" 94 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 95 | path_voc2 = "./VOC2007/" 96 | # path of where to store the models 97 | path_model = "../models_image_zooms" 98 | # path of where to store visualizations of search sequences 99 | path_testing_folder = '../testing_visualizations' 100 | # path of VGG16 weights 101 | path_vgg = "../vgg16_weights.h5" 102 | 103 | You can change them to point to your own locations. 104 | 105 | Training supports checkpointing, so you should indicate which epoch you are going to train when running the script. If you are training from scratch, the training command should be: 106 | 107 | python image_zooms_training.py -n 0 108 | 109 | There are many options that can be changed to test different configurations: 110 | 111 | **class_object**: the class for which you want to train the models. We have trained it for planes, and all the experiments of the paper are run on this class, but you can test other PASCAL categories, changing the training databases appropriately. 112 | 113 | **number_of_steps**: for how many steps the agent searches for an object in an image. 114 | 115 | **scale_subregion**: the scale of the subregions in the hierarchy compared to their ancestor. The default value is 3/4, which yielded good results in our experiments, but it can easily be changed. Take into consideration that the subregion scale and the number of steps are highly correlated: if the subregion scale is high, you will probably require more steps to find objects. 116 | 117 | **bool_draw**: a boolean that, if set to 1, stores visualizations of the image search sequences. 118 | 119 | At each epoch the models will be saved in the models_image_zooms folder. 120 | 121 | ##### Testing 122 | 123 | To test the models, use the script image_zooms_testing.py. You should also configure the paths to indicate which weights you want to use, in the same manner as in the training stage. In this case, you only need to run the command python image_zooms_testing.py. For testing it is recommended to set bool_draw = 1, so you can observe visualizations of the object search sequences. There is also the option to search for just a single object in each image, to reproduce the results of our paper, by setting the boolean only_first_object to 1.
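When the agent is allowed to search for more than one object per image (only_first_object = 0), every region found so far is covered with the VGG-16 mean pixel so that later search steps ignore it, as is also done during training (see the Setup section above). The following minimal sketch is not part of the repository scripts; it is a vectorised equivalent of mask_image_with_mean_background in scripts/image_helper.py, with `cover_found_object` as a hypothetical helper name.

```python
import numpy as np

# Mean pixel values used for VGG-16 preprocessing throughout this repository.
VGG_MEAN_PIXEL = (103.939, 116.779, 123.68)

def cover_found_object(image, region_mask):
    """Replace the pixels of an already-found region with the VGG-16 mean pixel.

    Vectorised sketch of mask_image_with_mean_background() in
    scripts/image_helper.py: subsequent search steps then ignore that object.
    `image` is an H x W x 3 array and `region_mask` an H x W binary mask.
    """
    covered = np.array(image, dtype=np.float32)
    covered[region_mask == 1] = VGG_MEAN_PIXEL
    return covered
```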
124 | 125 | 126 | ## Acknowledgements 127 | 128 | We would especially like to thank Albert Gil Moreno and Josep Pujal from our technical support team at the Image Processing Group at the UPC. We would also like to thank Carlos Tripiana from the technical support team at the Barcelona Supercomputing Center (BSC). 129 | 130 | | ![AlbertGil-photo] | ![JosepPujal-photo] | ![CarlosTripiana-photo] | 131 | |:-:|:-:|:-:| 132 | | [Albert Gil](https://imatge.upc.edu/web/people/albert-gil-moreno) | [Josep Pujal](https://imatge.upc.edu/web/people/josep-pujal) | [Carlos Tripiana](https://www.bsc.es/tripiana-carlos/) | 133 | 134 | [AlbertGil-photo]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/authors/AlbertGil.jpg "Albert Gil" 135 | [JosepPujal-photo]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/authors/JosepPujal.jpg "Josep Pujal" 136 | [CarlosTripiana-photo]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/authors/carlos160x160.jpeg?raw=true "Carlos Tripiana" 137 | 138 | [AlbertGil-web]: https://imatge.upc.edu/web/people/albert-gil-moreno 139 | [JosepPujal-web]: https://imatge.upc.edu/web/people/josep-pujal 140 | [CarlosTripiana-web]: https://www.bsc.es/tripiana-carlos/ 141 | 142 | | | | 143 | |:--|:-:| 144 | | This work has been supported by the [grant SEV2015-0493 of the Severo Ochoa Program](https://www.bsc.es/es/severo-ochoa/presentaci%C3%B3n) awarded by the Spanish Government, by project TIN2015-65316 of the Spanish Ministry of Science and Innovation, and by contract 2014-SGR-1051 of the Generalitat de Catalunya. | ![logo-severo] | 145 | | We gratefully acknowledge the support of [NVIDIA Corporation](http://www.nvidia.com/content/global/global.php) with the donation of the GeForce GTX [Titan Z](http://www.nvidia.com/gtx-700-graphics-cards/gtx-titan-z/) and [Titan X](http://www.geforce.com/hardware/desktop-gpus/geforce-gtx-titan-x) used in this work at the UPC, and the BSC/UPC NVIDIA GPU Center of Excellence. | ![logo-nvidia] | 146 | | The Image Processing Group at the UPC is a [SGR14 Consolidated Research Group](https://imatge.upc.edu/web/projects/sgr14-image-and-video-processing-group) recognized and sponsored by the Catalan Government (Generalitat de Catalunya) through its [AGAUR](http://agaur.gencat.cat/en/inici/index.html) office. | ![logo-catalonia] | 147 | | This work has been developed in the framework of the project [BigGraph TEC2013-43935-R](https://imatge.upc.edu/web/projects/biggraph-heterogeneous-information-and-graph-signal-processing-big-data-era-application), funded by the Spanish Ministerio de Economía y Competitividad and the European Regional Development Fund (ERDF). | ![logo-spain] | 148 | 149 | 150 | [logo-nvidia]: https://github.com/imatge-upc/detection-2016-nipsws/blob/master/logos/excellence_center.png?raw=true "Logo of NVidia" 151 | [logo-catalonia]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/logos/generalitat.jpg "Logo of Catalan government" 152 | [logo-spain]: https://raw.githubusercontent.com/imatge-upc/saliency-2016-cvpr/master/logos/MEyC.png "Logo of Spanish government" 153 | 154 | 155 | ## Contact 156 | 157 | If you have any general question about our work or code that may be of interest to other researchers, please use the [public issues section](https://github.com/imatge-upc/detection-2016-nipsws/issues) on this GitHub repo. Alternatively, drop us an e-mail at and .
158 | 159 | 160 | -------------------------------------------------------------------------------- /authors/FerranMarques160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/FerranMarques160x160.jpg -------------------------------------------------------------------------------- /authors/JordiTorres.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/JordiTorres.jpg -------------------------------------------------------------------------------- /authors/JordiTorres160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/JordiTorres160x160.jpg -------------------------------------------------------------------------------- /authors/MiriamBellver160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/MiriamBellver160x160.jpg -------------------------------------------------------------------------------- /authors/XavierGiro160x160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/XavierGiro160x160.jpg -------------------------------------------------------------------------------- /authors/carlos160x160.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/carlos160x160.jpeg -------------------------------------------------------------------------------- /authors/giro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/giro.jpg -------------------------------------------------------------------------------- /authors/marques.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/marques.jpg -------------------------------------------------------------------------------- /authors/miriam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/authors/miriam.jpg -------------------------------------------------------------------------------- /bellver-2016-nipsws.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/bellver-2016-nipsws.pdf -------------------------------------------------------------------------------- /img/HR_sequences.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/HR_sequences.png -------------------------------------------------------------------------------- /img/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/architecture.png -------------------------------------------------------------------------------- /img/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/hierarchy.png -------------------------------------------------------------------------------- /img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/results.png -------------------------------------------------------------------------------- /img/thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/img/thumbnail.png -------------------------------------------------------------------------------- /logos/MEyC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/MEyC.png -------------------------------------------------------------------------------- /logos/bsc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/bsc.jpg -------------------------------------------------------------------------------- /logos/bsc320x86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/bsc320x86.jpg -------------------------------------------------------------------------------- /logos/etsetb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/etsetb.png -------------------------------------------------------------------------------- /logos/excellence_center.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/excellence_center.png -------------------------------------------------------------------------------- /logos/generalitat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/generalitat.jpg -------------------------------------------------------------------------------- /logos/gpi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/gpi.png 
-------------------------------------------------------------------------------- /logos/gpi320x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/gpi320x70.png -------------------------------------------------------------------------------- /logos/nips500x95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/nips500x95.png -------------------------------------------------------------------------------- /logos/nvidia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/nvidia.jpg -------------------------------------------------------------------------------- /logos/severo_ochoa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/severo_ochoa.png -------------------------------------------------------------------------------- /logos/upc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imatge-upc/detection-2016-nipsws/dbcc3ac46b4e2a8841eadd1cbc46ee58c6d9a2a8/logos/upc.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.shutil-get-terminal-size==1.0.0 2 | cycler==0.10.0 3 | Cython==0.24.1 4 | decorator==4.0.10 5 | easydict==1.6 6 | enum34==1.1.6 7 | h5py==2.6.0 8 | ipython==5.1.0 9 | ipython-genutils==0.1.0 10 | Keras==1.0.8 11 | matplotlib==1.5.3 12 | numpy==1.11.1 13 | pathlib2==2.1.0 14 | pexpect==4.2.1 15 | pickleshare==0.7.4 16 | Pillow==3.3.1 17 | prompt-toolkit==1.0.7 18 | protobuf==3.0.0b2 19 | ptyprocess==0.5.1 20 | Pygments==2.1.3 21 | pyparsing==2.1.9 22 | python-dateutil==2.5.3 23 | pytz==2016.6.1 24 | PyYAML==3.12 25 | scikit-learn==0.17.1 26 | scipy==0.18.0 27 | simplegeneric==0.8.1 28 | six==1.10.0 29 | sklearn==0.0 30 | Theano==0.8.2 31 | traitlets==4.3.1 32 | wcwidth==0.1.7 33 | -------------------------------------------------------------------------------- /scripts/features.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers.core import Flatten, Dense, Dropout 3 | from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D 4 | from keras.optimizers import SGD 5 | from keras import backend as K 6 | import cv2, numpy as np 7 | import math 8 | import numpy, scipy 9 | from scipy import interpolate 10 | import scipy.ndimage 11 | import time 12 | 13 | # the feature size is of 7x7xp, being p the number of channels 14 | feature_size = 7 15 | # the relative scale reduction of the shallower feature map compared to the initial image input 16 | scale_reduction_shallower_feature = 16 17 | # the relative scale reduction of the deeper feature map compared to the initial image input 18 | scale_reduction_deeper_feature = 32 19 | # scaling of the input image 20 | factor_x_input = float(1) 21 | factor_y_input = float(1) 22 | 23 | 24 | # Interpolation of 2d features for a single 
channel of a feature map 25 | def interpolate_2d_features(features): 26 | out_size = feature_size 27 | x = np.arange(features.shape[0]) 28 | y = np.arange(features.shape[1]) 29 | z = features 30 | xx = np.linspace(x.min(), x.max(), out_size) 31 | yy = np.linspace(y.min(), y.max(), out_size) 32 | new_kernel = interpolate.RectBivariateSpline(x, y, z, kx=1, ky=1) 33 | kernel_out = new_kernel(xx, yy) 34 | return kernel_out 35 | 36 | 37 | # Interpolation 2d of each channel, so we obtain 3d interpolated feature maps 38 | def interpolate_3d_features(features): 39 | new_features = np.zeros([512, feature_size, feature_size]) 40 | for i in range(features.shape[0]): 41 | new_features[i, :, :] = interpolate_2d_features(features[i, :, :]) 42 | return new_features 43 | 44 | 45 | def pop_layer(model): 46 | if not model.outputs: 47 | raise Exception('Sequential model cannot be popped: model is empty.') 48 | model.layers.pop() 49 | if not model.layers: 50 | model.outputs = [] 51 | model.inbound_nodes = [] 52 | model.outbound_nodes = [] 53 | else: 54 | model.layers[-1].outbound_nodes = [] 55 | model.outputs = [model.layers[-1].output] 56 | model.built = False 57 | return model 58 | 59 | 60 | def get_convolutional_vgg16_compiled(vgg_weights_path): 61 | model_vgg = obtain_compiled_vgg_16(vgg_weights_path) 62 | for i in range(0, 6): 63 | model_vgg = pop_layer(model_vgg) 64 | return model_vgg 65 | 66 | 67 | def get_feature_maps(model, img): 68 | return [get_feature_map_4(model, img), get_feature_map_8(model, img)] 69 | 70 | 71 | # get deeper feature map 72 | def get_feature_map_8(model, im): 73 | im = im.astype(np.float32) 74 | dim_ordering = K.image_dim_ordering() 75 | if dim_ordering == 'th': 76 | # 'RGB'->'BGR' 77 | im = im[::-1, :, :] 78 | # Zero-center by mean pixel 79 | im[0, :, :] -= 103.939 80 | im[1, :, :] -= 116.779 81 | im[2, :, :] -= 123.68 82 | else: 83 | # 'RGB'->'BGR' 84 | im = im[:, :, ::-1] 85 | # Zero-center by mean pixel 86 | im[:, :, 0] -= 103.939 87 | im[:, :, 1] -= 116.779 88 | im[:, :, 2] -= 123.68 89 | im = im.transpose((2, 0, 1)) 90 | im = np.expand_dims(im, axis=0) 91 | inputs = [K.learning_phase()] + model.inputs 92 | _convout1_f = K.function(inputs, model.outputs) 93 | feature_map = _convout1_f([0] + [im]) 94 | feature_map = np.array([feature_map]) 95 | feature_map = feature_map[0, 0, 0, :, :, :] 96 | return feature_map 97 | 98 | 99 | # get shallower feature map 100 | def get_feature_map_4(model, im): 101 | im = im.astype(np.float32) 102 | dim_ordering = K.image_dim_ordering() 103 | if dim_ordering == 'th': 104 | # 'RGB'->'BGR' 105 | im = im[::-1, :, :] 106 | # Zero-center by mean pixel 107 | im[0, :, :] -= 103.939 108 | im[1, :, :] -= 116.779 109 | im[2, :, :] -= 123.68 110 | else: 111 | # 'RGB'->'BGR' 112 | im = im[:, :, ::-1] 113 | # Zero-center by mean pixel 114 | im[:, :, 0] -= 103.939 115 | im[:, :, 1] -= 116.779 116 | im[:, :, 2] -= 123.68 117 | im = im.transpose((2, 0, 1)) 118 | im = np.expand_dims(im, axis=0) 119 | inputs = [K.learning_phase()] + model.inputs 120 | _convout1_f = K.function(inputs, [model.layers[23].output]) 121 | feature_map = _convout1_f([0] + [im]) 122 | feature_map = np.array([feature_map]) 123 | feature_map = feature_map[0, 0, 0, :, :, :] 124 | return feature_map 125 | 126 | 127 | def crop_roi(feature_map, coordinates): 128 | return feature_map[:, coordinates[0]:coordinates[0]+coordinates[2], coordinates[1]:coordinates[1]+coordinates[3]] 129 | 130 | 131 | # this method decides whether to use the deeper or the shallower feature map 132 | # and then crops 
and interpolates if necessary the features to obtain a final descriptor of 7x7xp 133 | def obtain_descriptor_from_feature_map(feature_maps, region_coordinates): 134 | initial_width = region_coordinates[2]*factor_x_input 135 | initial_height = region_coordinates[3]*factor_y_input 136 | scale_aux = math.sqrt(initial_height*initial_width)/math.sqrt(feature_size*feature_size) 137 | if scale_aux > scale_reduction_deeper_feature: 138 | scale = scale_reduction_deeper_feature 139 | feature_map = feature_maps[1] 140 | else: 141 | scale = scale_reduction_shallower_feature 142 | feature_map = feature_maps[0] 143 | new_width = initial_width/scale 144 | new_height = initial_height/scale 145 | if new_width < feature_size: 146 | new_width = feature_size 147 | if new_height < feature_size: 148 | new_height = feature_size 149 | xo = region_coordinates[0]/scale 150 | yo = region_coordinates[1]/scale 151 | feat = np.array([feature_map]) 152 | if new_width + xo > feat.shape[2]: 153 | xo = feat.shape[2] - new_width 154 | if new_height + yo > feat.shape[3]: 155 | yo = feat.shape[3] - new_height 156 | if xo < 0: 157 | xo = 0 158 | if yo < 0: 159 | yo = 0 160 | new_coordinates = np.array([xo, yo, new_width, new_height]) 161 | roi = crop_roi(feature_map, new_coordinates) 162 | if roi.shape[1] < feature_size & roi.shape[2] < feature_size: 163 | features = interpolate_3d_features(roi) 164 | elif roi.shape[2] < feature_size: 165 | features = interpolate_3d_features(roi) 166 | elif roi.shape[1] < feature_size: 167 | features = interpolate_3d_features(roi) 168 | else: 169 | features = extract_features_from_roi(roi) 170 | return features 171 | 172 | 173 | # ROI-pooling features 174 | def extract_features_from_roi(roi): 175 | roi_width = roi.shape[1] 176 | roi_height = roi.shape[2] 177 | new_width = roi_width / feature_size 178 | new_height = roi_height / feature_size 179 | pooled_values = np.zeros([feature_size, feature_size, 512]) 180 | for j in range(512): 181 | for i in range(feature_size): 182 | for k in range(feature_size): 183 | if k == (feature_size-1) & i == (feature_size-1): 184 | patch = roi[j, i * new_width:roi_width, k * new_height:roi_height] 185 | elif k == (feature_size-1): 186 | patch = roi[j, i * new_width:(i + 1) * new_width, k * new_height:roi_height] 187 | elif i == (feature_size-1): 188 | patch = roi[j, i * new_width:roi_width, k * new_height:(k + 1) * new_height] 189 | else: 190 | patch = roi[j, i * new_width:(i + 1) * new_width, k * new_height:(k + 1) * new_height] 191 | pooled_values[i, k, j] = np.max(patch) 192 | return pooled_values 193 | 194 | 195 | def calculate_all_initial_feature_maps(images, model, image_names): 196 | initial_feature_maps = [] 197 | for z in range(np.size(image_names)): 198 | initial_feature_maps.append(get_feature_maps(model, np.array(images[z]))) 199 | return initial_feature_maps 200 | 201 | 202 | def get_image_descriptor_for_image(image, model): 203 | im = cv2.resize(image, (224, 224)).astype(np.float32) 204 | dim_ordering = K.image_dim_ordering() 205 | if dim_ordering == 'th': 206 | # 'RGB'->'BGR' 207 | im = im[::-1, :, :] 208 | # Zero-center by mean pixel 209 | im[0, :, :] -= 103.939 210 | im[1, :, :] -= 116.779 211 | im[2, :, :] -= 123.68 212 | else: 213 | # 'RGB'->'BGR' 214 | im = im[:, :, ::-1] 215 | # Zero-center by mean pixel 216 | im[:, :, 0] -= 103.939 217 | im[:, :, 1] -= 116.779 218 | im[:, :, 2] -= 123.68 219 | im = im.transpose((2, 0, 1)) 220 | im = np.expand_dims(im, axis=0) 221 | inputs = [K.learning_phase()] + model.inputs 222 | _convout1_f = 
K.function(inputs, [model.layers[33].output]) 223 | return _convout1_f([0] + [im]) 224 | 225 | 226 | def get_conv_image_descriptor_for_image(image, model): 227 | im = cv2.resize(image, (224, 224)).astype(np.float32) 228 | dim_ordering = K.image_dim_ordering() 229 | if dim_ordering == 'th': 230 | # 'RGB'->'BGR' 231 | im = im[::-1, :, :] 232 | # Zero-center by mean pixel 233 | im[0, :, :] -= 103.939 234 | im[1, :, :] -= 116.779 235 | im[2, :, :] -= 123.68 236 | else: 237 | # 'RGB'->'BGR' 238 | im = im[:, :, ::-1] 239 | # Zero-center by mean pixel 240 | im[:, :, 0] -= 103.939 241 | im[:, :, 1] -= 116.779 242 | im[:, :, 2] -= 123.68 243 | im = im.transpose((2, 0, 1)) 244 | im = np.expand_dims(im, axis=0) 245 | inputs = [K.learning_phase()] + model.inputs 246 | _convout1_f = K.function(inputs, [model.layers[31].output]) 247 | return _convout1_f([0] + [im]) 248 | 249 | 250 | def obtain_compiled_vgg_16(vgg_weights_path): 251 | model = vgg_16(vgg_weights_path) 252 | sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 253 | model.compile(optimizer=sgd, loss='categorical_crossentropy') 254 | return model 255 | 256 | 257 | def vgg_16(weights_path=None): 258 | model = Sequential() 259 | model.add(ZeroPadding2D((1, 1), input_shape=(3, 224, 224))) 260 | model.add(Convolution2D(64, 3, 3, activation='relu')) 261 | model.add(ZeroPadding2D((1, 1))) 262 | model.add(Convolution2D(64, 3, 3, activation='relu')) 263 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 264 | 265 | model.add(ZeroPadding2D((1, 1))) 266 | model.add(Convolution2D(128, 3, 3, activation='relu')) 267 | model.add(ZeroPadding2D((1, 1))) 268 | model.add(Convolution2D(128, 3, 3, activation='relu')) 269 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 270 | 271 | model.add(ZeroPadding2D((1, 1))) 272 | model.add(Convolution2D(256, 3, 3, activation='relu')) 273 | model.add(ZeroPadding2D((1, 1))) 274 | model.add(Convolution2D(256, 3, 3, activation='relu')) 275 | model.add(ZeroPadding2D((1, 1))) 276 | model.add(Convolution2D(256, 3, 3, activation='relu')) 277 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 278 | 279 | model.add(ZeroPadding2D((1, 1))) 280 | model.add(Convolution2D(512, 3, 3, activation='relu')) 281 | model.add(ZeroPadding2D((1, 1))) 282 | model.add(Convolution2D(512, 3, 3, activation='relu')) 283 | model.add(ZeroPadding2D((1, 1))) 284 | model.add(Convolution2D(512, 3, 3, activation='relu')) 285 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 286 | 287 | model.add(ZeroPadding2D((1, 1))) 288 | model.add(Convolution2D(512, 3, 3, activation='relu')) 289 | model.add(ZeroPadding2D((1, 1))) 290 | model.add(Convolution2D(512, 3, 3, activation='relu')) 291 | model.add(ZeroPadding2D((1, 1))) 292 | model.add(Convolution2D(512, 3, 3, activation='relu')) 293 | model.add(MaxPooling2D((2, 2), strides=(2, 2))) 294 | 295 | model.add(Flatten()) 296 | model.add(Dense(4096, activation='relu')) 297 | model.add(Dropout(0.5)) 298 | model.add(Dense(4096, activation='relu')) 299 | model.add(Dropout(0.5)) 300 | model.add(Dense(1000, activation='softmax')) 301 | 302 | if weights_path: 303 | model.load_weights(weights_path) 304 | 305 | return model 306 | 307 | -------------------------------------------------------------------------------- /scripts/image_helper.py: -------------------------------------------------------------------------------- 1 | from keras.preprocessing import image 2 | import numpy as np 3 | 4 | 5 | def get_all_ids(annotations): 6 | all_ids = [] 7 | for i in range(len(annotations)): 8 | 
all_ids.append(get_ids_objects_from_annotation(annotations[i])) 9 | return all_ids 10 | 11 | 12 | def get_all_images(image_names, path_voc): 13 | images = [] 14 | for j in range(np.size(image_names)): 15 | image_name = image_names[0][j] 16 | string = path_voc + '/JPEGImages/' + image_name + '.jpg' 17 | images.append(image.load_img(string, False)) 18 | return images 19 | 20 | 21 | def get_all_images_pool(image_names, path_voc): 22 | images = [] 23 | for j in range(np.size(image_names)): 24 | image_name = image_names[j] 25 | string = path_voc + '/JPEGImages/' + image_name + '.jpg' 26 | images.append(image.load_img(string, False)) 27 | return images 28 | 29 | 30 | def load_images_names_in_data_set(data_set_name, path_voc): 31 | file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt' 32 | f = open(file_path) 33 | image_names = f.readlines() 34 | image_names = [x.strip('\n') for x in image_names] 35 | if data_set_name.startswith("aeroplane") | data_set_name.startswith("bird") | data_set_name.startswith("cow"): 36 | return [x.split(None, 1)[0] for x in image_names] 37 | else: 38 | return [x.strip('\n') for x in image_names] 39 | 40 | 41 | def load_images_labels_in_data_set(data_set_name, path_voc): 42 | file_path = path_voc + '/ImageSets/Main/' + data_set_name + '.txt' 43 | f = open(file_path) 44 | images_names = f.readlines() 45 | images_names = [x.split(None, 1)[1] for x in images_names] 46 | images_names = [x.strip('\n') for x in images_names] 47 | return images_names 48 | 49 | 50 | def mask_image_with_mean_background(mask_object_found, image): 51 | new_image = image 52 | size_image = np.shape(mask_object_found) 53 | for j in range(size_image[0]): 54 | for i in range(size_image[1]): 55 | if mask_object_found[j][i] == 1: 56 | new_image[j, i, 0] = 103.939 57 | new_image[j, i, 1] = 116.779 58 | new_image[j, i, 2] = 123.68 59 | return new_image -------------------------------------------------------------------------------- /scripts/image_zooms_testing.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | from scipy import ndimage 13 | from keras.preprocessing import image 14 | from sklearn.preprocessing import OneHotEncoder 15 | 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | if __name__ == "__main__": 25 | 26 | ######## PATHS definition ######## 27 | 28 | # path of pascal voc test 29 | path_voc_test = "./VOC2007_test/" 30 | # model name of the weights 31 | model_name = "model_image_zooms" 32 | # path of folder where the weights are 33 | weights_path = "../models_image_zooms/" 34 | # path of where to store visualizations of search sequences 35 | path_testing_folder = '../testing/' 36 | # path of VGG16 weights 37 | path_vgg = "../vgg16_weights.h5" 38 | 39 | ######## MODELS ######## 40 | 41 | model_vgg = obtain_compiled_vgg_16(path_vgg) 42 | model = 
get_q_network(weights_path + model_name) 43 | 44 | ######## LOAD IMAGE NAMES ######## 45 | 46 | image_names = np.array([load_images_names_in_data_set('aeroplane_test', path_voc_test)]) 47 | labels = load_images_labels_in_data_set('aeroplane_test', path_voc_test) 48 | 49 | ######## LOAD IMAGES ######## 50 | 51 | images = get_all_images(image_names, path_voc_test) 52 | 53 | ######## PARAMETERS ######## 54 | 55 | # Class category of PASCAL that the RL agent will be searching 56 | class_object = 1 57 | # 1 if you want to obtain visualizations of the search for objects 58 | bool_draw = 1 59 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 60 | scale_subregion = float(3)/4 61 | scale_mask = float(1)/(scale_subregion*4) 62 | # Number of steps that the agent does at each image 63 | number_of_steps = 10 64 | # Only search first object 65 | only_first_object = 1 66 | 67 | for j in range(np.size(image_names)): 68 | if labels[j] == "1": 69 | image = np.array(images[j]) 70 | # init drawing for visualization 71 | background = Image.new('RGBA', (10000, 2000), (255, 255, 255, 255)) 72 | draw = ImageDraw.Draw(background) 73 | image_name = image_names[0][j] 74 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc_test) 75 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 76 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 77 | size_mask = (image.shape[0], image.shape[1]) 78 | original_shape = size_mask 79 | image_for_search = image 80 | region_mask = np.ones([image.shape[0], image.shape[1]]) 81 | # offset of the region observed at each time step 82 | offset = (0, 0) 83 | # absolute status is a boolean we indicate if the agent will continue 84 | # searching object or not. 
If the first object already covers the whole 85 | # image, we can put it at 0 so we do not further search there 86 | absolute_status = 1 87 | action = 0 88 | step = 0 89 | qval = 0 90 | region_image = image_for_search 91 | region_mask = np.ones([image.shape[0], image.shape[1]]) 92 | # we run the agent if the maximum number of steps has not been reached and 93 | # if the boolean 94 | while (step < number_of_steps) and (absolute_status == 1): 95 | iou = 0 96 | # we init history vector as we are going to find another object 97 | history_vector = np.zeros([24]) 98 | state = get_state(region_image, history_vector, model_vgg) 99 | status = 1 100 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 101 | region_mask, image_name, bool_draw) 102 | size_mask = (image.shape[0], image.shape[1]) 103 | original_shape = size_mask 104 | region_mask = np.ones([image.shape[0], image.shape[1]]) 105 | while (status == 1) & (step < number_of_steps): 106 | step += 1 107 | qval = model.predict(state.T, batch_size=1) 108 | action = (np.argmax(qval))+1 109 | # movement action, make the proper zoom on the image 110 | if action != 6: 111 | region_mask = np.zeros(original_shape) 112 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 113 | if action == 1: 114 | offset_aux = (0, 0) 115 | elif action == 2: 116 | offset_aux = (0, size_mask[1] * scale_mask) 117 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 118 | elif action == 3: 119 | offset_aux = (size_mask[0] * scale_mask, 0) 120 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 121 | elif action == 4: 122 | offset_aux = (size_mask[0] * scale_mask, 123 | size_mask[1] * scale_mask) 124 | offset = (offset[0] + size_mask[0] * scale_mask, 125 | offset[1] + size_mask[1] * scale_mask) 126 | elif action == 5: 127 | offset_aux = (size_mask[0] * scale_mask / 2, 128 | size_mask[0] * scale_mask / 2) 129 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 130 | offset[1] + size_mask[0] * scale_mask / 2) 131 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 132 | offset_aux[1]:offset_aux[1] + size_mask[1]] 133 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 134 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 135 | region_mask, image_name, bool_draw) 136 | # trigger action 137 | if action == 6: 138 | offset = (0, 0) 139 | status = 0 140 | if step == 1: 141 | absolute_status = 0 142 | if only_first_object == 1: 143 | absolute_status = 0 144 | image_for_search = mask_image_with_mean_background(region_mask, image_for_search) 145 | region_image = image_for_search 146 | history_vector = update_history_vector(history_vector, action) 147 | new_state = get_state(region_image, history_vector, model_vgg) 148 | state = new_state 149 | -------------------------------------------------------------------------------- /scripts/image_zooms_training.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | import argparse 13 | from scipy import ndimage 14 
| from keras.preprocessing import image 15 | from sklearn.preprocessing import OneHotEncoder 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | 25 | # Read number of epoch to be trained, to make checkpointing 26 | parser = argparse.ArgumentParser(description='Epoch:') 27 | parser.add_argument("-n", metavar='N', type=int, default=0) 28 | args = parser.parse_args() 29 | epochs_id = int(args.n) 30 | 31 | 32 | if __name__ == "__main__": 33 | 34 | ######## PATHS definition ######## 35 | 36 | # path of PASCAL VOC 2012 or other database to use for training 37 | path_voc = "./VOC2012/" 38 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 39 | path_voc2 = "./VOC2007/" 40 | # path of where to store the models 41 | path_model = "../models_image_zooms" 42 | # path of where to store visualizations of search sequences 43 | path_testing_folder = '../testing_visualizations' 44 | # path of VGG16 weights 45 | path_vgg = "../vgg16_weights.h5" 46 | 47 | ######## PARAMETERS ######## 48 | 49 | # Class category of PASCAL that the RL agent will be searching 50 | class_object = 1 51 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 52 | scale_subregion = float(3)/4 53 | scale_mask = float(1)/(scale_subregion*4) 54 | # 1 if you want to obtain visualizations of the search for objects 55 | bool_draw = 0 56 | # How many steps can run the agent until finding one object 57 | number_of_steps = 10 58 | # Boolean to indicate if you want to use the two databases, or just one 59 | two_databases = 0 60 | epochs = 50 61 | gamma = 0.90 62 | epsilon = 1 63 | batch_size = 100 64 | # Pointer to where to store the last experience in the experience replay buffer, 65 | # actually there is a pointer for each PASCAL category, in case all categories 66 | # are trained at the same time 67 | h = np.zeros([20]) 68 | # Each replay memory (one for each possible category) has a capacity of 100 experiences 69 | buffer_experience_replay = 1000 70 | # Init replay memories 71 | replay = [[] for i in range(20)] 72 | reward = 0 73 | 74 | ######## MODELS ######## 75 | 76 | model_vgg = obtain_compiled_vgg_16(path_vgg) 77 | 78 | # If you want to train it from first epoch, first option is selected. 
Otherwise, 79 | # when making checkpointing, weights of last stored weights are loaded for a particular class object 80 | 81 | if epochs_id == 0: 82 | models = get_array_of_q_networks_for_pascal("0", class_object) 83 | else: 84 | models = get_array_of_q_networks_for_pascal(path_model, class_object) 85 | 86 | ######## LOAD IMAGE NAMES ######## 87 | 88 | if two_databases == 1: 89 | image_names1 = np.array([load_images_names_in_data_set('trainval', path_voc)]) 90 | image_names2 = np.array([load_images_names_in_data_set('trainval', path_voc2)]) 91 | image_names = np.concatenate([image_names1, image_names2]) 92 | else: 93 | image_names = np.array([load_images_names_in_data_set('trainval', path_voc)]) 94 | 95 | ######## LOAD IMAGES ######## 96 | 97 | if two_databases == 1: 98 | images1 = get_all_images(image_names1, path_voc) 99 | images2 = get_all_images(image_names2, path_voc2) 100 | images = np.concatenate([images1, images2]) 101 | else: 102 | images = get_all_images(image_names, path_voc) 103 | 104 | for i in range(epochs_id, epochs_id + epochs): 105 | for j in range(np.size(image_names)): 106 | masked = 0 107 | not_finished = 1 108 | image = np.array(images[j]) 109 | image_name = image_names[0][j] 110 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 111 | if two_databases == 1: 112 | if j < np.size(image_names1): 113 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 114 | else: 115 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc2) 116 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 117 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 118 | region_mask = np.ones([image.shape[0], image.shape[1]]) 119 | shape_gt_masks = np.shape(gt_masks) 120 | available_objects = np.ones(np.size(array_classes_gt_objects)) 121 | # Iterate through all the objects in the ground truth of an image 122 | for k in range(np.size(array_classes_gt_objects)): 123 | # Init visualization 124 | background = Image.new('RGBA', (10000, 2500), (255, 255, 255, 255)) 125 | draw = ImageDraw.Draw(background) 126 | # We check whether the ground truth object is of the target class category 127 | if array_classes_gt_objects[k] == class_object: 128 | gt_mask = gt_masks[:, :, k] 129 | step = 0 130 | new_iou = 0 131 | # this matrix stores the IoU of each object of the ground-truth, just in case 132 | # the agent changes of observed object 133 | last_matrix = np.zeros([np.size(array_classes_gt_objects)]) 134 | region_image = image 135 | offset = (0, 0) 136 | size_mask = (image.shape[0], image.shape[1]) 137 | original_shape = size_mask 138 | old_region_mask = region_mask 139 | region_mask = np.ones([image.shape[0], image.shape[1]]) 140 | # If the ground truth object is already masked by other already found masks, do not 141 | # use it for training 142 | if masked == 1: 143 | for p in range(gt_masks.shape[2]): 144 | overlap = calculate_overlapping(old_region_mask, gt_masks[:, :, p]) 145 | if overlap > 0.60: 146 | available_objects[p] = 0 147 | # We check if there are still obejcts to be found 148 | if np.count_nonzero(available_objects) == 0: 149 | not_finished = 0 150 | # follow_iou function calculates at each time step which is the groun truth object 151 | # that overlaps more with the visual region, so that we can calculate the rewards appropiately 152 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, array_classes_gt_objects, 153 | class_object, last_matrix, available_objects) 154 | 
new_iou = iou 155 | gt_mask = gt_masks[:, :, index] 156 | # init of the history vector that indicates past actions (6 actions * 4 steps in the memory) 157 | history_vector = np.zeros([24]) 158 | # computation of the initial state 159 | state = get_state(region_image, history_vector, model_vgg) 160 | # status indicates whether the agent is still alive and has not triggered the terminal action 161 | status = 1 162 | action = 0 163 | reward = 0 164 | if step > number_of_steps: 165 | background = draw_sequences(i, k, step, action, draw, region_image, background, 166 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 167 | bool_draw) 168 | step += 1 169 | while (status == 1) & (step < number_of_steps) & not_finished: 170 | category = int(array_classes_gt_objects[k]-1) 171 | model = models[0][category] 172 | qval = model.predict(state.T, batch_size=1) 173 | background = draw_sequences(i, k, step, action, draw, region_image, background, 174 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 175 | bool_draw) 176 | step += 1 177 | # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent 178 | if (i < 100) & (new_iou > 0.5): 179 | action = 6 180 | # epsilon-greedy policy 181 | elif random.random() < epsilon: 182 | action = np.random.randint(1, 7) 183 | else: 184 | action = (np.argmax(qval))+1 185 | # terminal action 186 | if action == 6: 187 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 188 | array_classes_gt_objects, class_object, 189 | last_matrix, available_objects) 190 | gt_mask = gt_masks[:, :, index] 191 | reward = get_reward_trigger(new_iou) 192 | background = draw_sequences(i, k, step, action, draw, region_image, background, 193 | path_testing_folder, iou, reward, gt_mask, region_mask, 194 | image_name, bool_draw) 195 | step += 1 196 | # movement action, we perform the crop of the corresponding subregion 197 | else: 198 | region_mask = np.zeros(original_shape) 199 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 200 | if action == 1: 201 | offset_aux = (0, 0) 202 | elif action == 2: 203 | offset_aux = (0, size_mask[1] * scale_mask) 204 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 205 | elif action == 3: 206 | offset_aux = (size_mask[0] * scale_mask, 0) 207 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 208 | elif action == 4: 209 | offset_aux = (size_mask[0] * scale_mask, 210 | size_mask[1] * scale_mask) 211 | offset = (offset[0] + size_mask[0] * scale_mask, 212 | offset[1] + size_mask[1] * scale_mask) 213 | elif action == 5: 214 | offset_aux = (size_mask[0] * scale_mask / 2, 215 | size_mask[0] * scale_mask / 2) 216 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 217 | offset[1] + size_mask[0] * scale_mask / 2) 218 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 219 | offset_aux[1]:offset_aux[1] + size_mask[1]] 220 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 221 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 222 | array_classes_gt_objects, class_object, 223 | last_matrix, available_objects) 224 | gt_mask = gt_masks[:, :, index] 225 | reward = get_reward_movement(iou, new_iou) 226 | iou = new_iou 227 | history_vector = update_history_vector(history_vector, action) 228 | new_state = get_state(region_image, history_vector, model_vgg) 229 | # Experience replay storage 230 | if len(replay[category]) < buffer_experience_replay: 231 | 
replay[category].append((state, action, reward, new_state)) 232 | else: 233 | if h[category] < (buffer_experience_replay-1): 234 | h[category] += 1 235 | else: 236 | h[category] = 0 237 | h_aux = h[category] 238 | h_aux = int(h_aux) 239 | replay[category][h_aux] = (state, action, reward, new_state) 240 | minibatch = random.sample(replay[category], batch_size) 241 | X_train = [] 242 | y_train = [] 243 | # we pick from the replay memory a sampled minibatch and generate the training samples 244 | for memory in minibatch: 245 | old_state, action, reward, new_state = memory 246 | old_qval = model.predict(old_state.T, batch_size=1) 247 | newQ = model.predict(new_state.T, batch_size=1) 248 | maxQ = np.max(newQ) 249 | y = np.zeros([1, 6]) 250 | y = old_qval 251 | y = y.T 252 | if action != 6: #non-terminal state 253 | update = (reward + (gamma * maxQ)) 254 | else: #terminal state 255 | update = reward 256 | y[action-1] = update #target output 257 | X_train.append(old_state) 258 | y_train.append(y) 259 | X_train = np.array(X_train) 260 | y_train = np.array(y_train) 261 | X_train = X_train.astype("float32") 262 | y_train = y_train.astype("float32") 263 | X_train = X_train[:, :, 0] 264 | y_train = y_train[:, :, 0] 265 | hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=0) 266 | models[0][category] = model 267 | state = new_state 268 | if action == 6: 269 | status = 0 270 | masked = 1 271 | # we mask object found with ground-truth so that agent learns faster 272 | image = mask_image_with_mean_background(gt_mask, image) 273 | else: 274 | masked = 0 275 | available_objects[index] = 0 276 | if epsilon > 0.1: 277 | epsilon -= 0.1 278 | for t in range (np.size(models)): 279 | if t == (class_object-1): 280 | string = path_model + '/model' + str(t) + '_epoch_' + str(i) + 'h5' 281 | string2 = path_model + '/model' + str(t) + 'h5' 282 | model = models[0][t] 283 | model.save_weights(string, overwrite=True) 284 | model.save_weights(string2, overwrite=True) 285 | 286 | -------------------------------------------------------------------------------- /scripts/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def calculate_iou(img_mask, gt_mask): 6 | gt_mask *= 1.0 7 | img_and = cv2.bitwise_and(img_mask, gt_mask) 8 | img_or = cv2.bitwise_or(img_mask, gt_mask) 9 | j = np.count_nonzero(img_and) 10 | i = np.count_nonzero(img_or) 11 | iou = float(float(j)/float(i)) 12 | return iou 13 | 14 | 15 | def calculate_overlapping(img_mask, gt_mask): 16 | gt_mask *= 1.0 17 | img_and = cv2.bitwise_and(img_mask, gt_mask) 18 | j = np.count_nonzero(img_and) 19 | i = np.count_nonzero(gt_mask) 20 | overlap = float(float(j)/float(i)) 21 | return overlap 22 | 23 | 24 | def follow_iou(gt_masks, mask, array_classes_gt_objects, object_id, last_matrix, available_objects): 25 | results = np.zeros([np.size(array_classes_gt_objects), 1]) 26 | for k in range(np.size(array_classes_gt_objects)): 27 | if array_classes_gt_objects[k] == object_id: 28 | if available_objects[k] == 1: 29 | gt_mask = gt_masks[:, :, k] 30 | iou = calculate_iou(mask, gt_mask) 31 | results[k] = iou 32 | else: 33 | results[k] = -1 34 | max_result = max(results) 35 | ind = np.argmax(results) 36 | iou = last_matrix[ind] 37 | new_iou = max_result 38 | return iou, new_iou, results, ind 39 | -------------------------------------------------------------------------------- /scripts/parse_xml_annotations.py: 
-------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import numpy as np 3 | 4 | 5 | def get_bb_of_gt_from_pascal_xml_annotation(xml_name, voc_path): 6 | string = voc_path + '/Annotations/' + xml_name + '.xml' 7 | tree = ET.parse(string) 8 | root = tree.getroot() 9 | names = [] 10 | x_min = [] 11 | x_max = [] 12 | y_min = [] 13 | y_max = [] 14 | for child in root: 15 | if child.tag == 'object': 16 | for child2 in child: 17 | if child2.tag == 'name': 18 | names.append(child2.text) 19 | elif child2.tag == 'bndbox': 20 | for child3 in child2: 21 | if child3.tag == 'xmin': 22 | x_min.append(child3.text) 23 | elif child3.tag == 'xmax': 24 | x_max.append(child3.text) 25 | elif child3.tag == 'ymin': 26 | y_min.append(child3.text) 27 | elif child3.tag == 'ymax': 28 | y_max.append(child3.text) 29 | category_and_bb = np.zeros([np.size(names), 5]) 30 | for i in range(np.size(names)): 31 | category_and_bb[i][0] = get_id_of_class_name(names[i]) 32 | category_and_bb[i][1] = x_min[i] 33 | category_and_bb[i][2] = x_max[i] 34 | category_and_bb[i][3] = y_min[i] 35 | category_and_bb[i][4] = y_max[i] 36 | return category_and_bb 37 | 38 | 39 | def get_all_annotations(image_names, voc_path): 40 | annotations = [] 41 | for i in range(np.size(image_names)): 42 | image_name = image_names[0][i] 43 | annotations.append(get_bb_of_gt_from_pascal_xml_annotation(image_name, voc_path)) 44 | return annotations 45 | 46 | 47 | def generate_bounding_box_from_annotation(annotation, image_shape): 48 | length_annotation = annotation.shape[0] 49 | masks = np.zeros([image_shape[0], image_shape[1], length_annotation]) 50 | for i in range(0, length_annotation): 51 | masks[annotation[i, 3]:annotation[i, 4], annotation[i, 1]:annotation[i, 2], i] = 1 52 | return masks 53 | 54 | 55 | def get_ids_objects_from_annotation(annotation): 56 | return annotation[:, 0] 57 | 58 | 59 | def get_id_of_class_name (class_name): 60 | if class_name == 'aeroplane': 61 | return 1 62 | elif class_name == 'bicycle': 63 | return 2 64 | elif class_name == 'bird': 65 | return 3 66 | elif class_name == 'boat': 67 | return 4 68 | elif class_name == 'bottle': 69 | return 5 70 | elif class_name == 'bus': 71 | return 6 72 | elif class_name == 'car': 73 | return 7 74 | elif class_name == 'cat': 75 | return 8 76 | elif class_name == 'chair': 77 | return 9 78 | elif class_name == 'cow': 79 | return 10 80 | elif class_name == 'diningtable': 81 | return 11 82 | elif class_name == 'dog': 83 | return 12 84 | elif class_name == 'horse': 85 | return 13 86 | elif class_name == 'motorbike': 87 | return 14 88 | elif class_name == 'person': 89 | return 15 90 | elif class_name == 'pottedplant': 91 | return 16 92 | elif class_name == 'sheep': 93 | return 17 94 | elif class_name == 'sofa': 95 | return 18 96 | elif class_name == 'train': 97 | return 19 98 | elif class_name == 'tvmonitor': 99 | return 20 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /scripts/pool45_crops_testing.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, 
Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | from scipy import ndimage 13 | from keras.preprocessing import image 14 | from sklearn.preprocessing import OneHotEncoder 15 | 16 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 17 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 18 | from parse_xml_annotations import * 19 | from image_helper import * 20 | from metrics import * 21 | from visualization import * 22 | from reinforcement import * 23 | 24 | if __name__ == "__main__": 25 | 26 | ######## PATHS definition ######## 27 | 28 | # path of pascal voc test 29 | path_voc_test = "./VOC2007_test/" 30 | # model name of the weights 31 | model_name = "model_pool45_crops" 32 | # path of folder where the weights are 33 | weights_path = "../models_pool45_crops/" 34 | # path of where to store visualizations of search sequences 35 | path_testing_folder = '../testing/' 36 | # path of VGG16 weights 37 | path_vgg = "../vgg16_weights.h5" 38 | 39 | ######## MODELS ######## 40 | 41 | model_vgg = get_convolutional_vgg16_compiled(path_vgg) 42 | model = get_q_network(weights_path + model_name) 43 | 44 | ######## LOAD IMAGE NAMES ######## 45 | 46 | image_names = np.array([load_images_names_in_data_set('aeroplane_test', path_voc_test)]) 47 | labels = load_images_labels_in_data_set('aeroplane_test', path_voc_test) 48 | 49 | ######## LOAD IMAGES ######## 50 | 51 | images = get_all_images(image_names, path_voc_test) 52 | 53 | ######## PARAMETERS ######## 54 | 55 | # Class category of PASCAL that the RL agent will be searching 56 | class_object = 1 57 | # 1 if you want to obtain visualizations of the search for objects 58 | bool_draw = 1 59 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 60 | scale_subregion = float(3)/4 61 | scale_mask = float(1)/(scale_subregion*4) 62 | # Number of steps that the agent does at each image 63 | number_of_steps = 10 64 | # Only search first object 65 | only_first_object = 1 66 | 67 | for j in range(np.size(image_names)): 68 | if labels[j] == "1": 69 | image = np.array(images[j]) 70 | # init drawing for visualization 71 | background = Image.new('RGBA', (10000, 2000), (255, 255, 255, 255)) 72 | draw = ImageDraw.Draw(background) 73 | image_name = image_names[0][j] 74 | # get feature maps for the image 75 | feature_maps = get_feature_maps(model_vgg, image) 76 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc_test) 77 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 78 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 79 | size_mask = (image.shape[0], image.shape[1]) 80 | original_shape = size_mask 81 | image_for_search = image 82 | region_mask = np.ones([image.shape[0], image.shape[1]]) 83 | count = 0 84 | # offset of the region observed at each time step 85 | offset = (0, 0) 86 | # absolute status is a boolean we indicate if the agent will continue 87 | # searching object or not. 
If the first object already covers the whole 88 | # image, we can put it at 0 so we do not further search there 89 | absolute_status = 1 90 | action = 0 91 | step = 0 92 | qval = 0 93 | region_image = image_for_search 94 | region_mask = np.ones([image.shape[0], image.shape[1]]) 95 | while (step < number_of_steps) and (absolute_status == 1): 96 | iou = 0 97 | history_vector = np.zeros([24]) 98 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 99 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 100 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 101 | state = get_state_pool45(history_vector, region_descriptor_2) 102 | status = 1 103 | iou = 0 104 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 105 | region_mask, image_name, bool_draw) 106 | size_mask = (image.shape[0], image.shape[1]) 107 | original_shape = size_mask 108 | region_mask = np.ones([image.shape[0], image.shape[1]]) 109 | while (status == 1) & (step < number_of_steps): 110 | step += 1 111 | qval = model.predict(state.T, batch_size=1) 112 | action = (np.argmax(qval))+1 113 | # movement action, make the proper zoom on the image 114 | if action != 6: 115 | region_mask = np.zeros(original_shape) 116 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 117 | if action == 1: 118 | offset_aux = (0, 0) 119 | elif action == 2: 120 | offset_aux = (0, size_mask[1] * scale_mask) 121 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 122 | elif action == 3: 123 | offset_aux = (size_mask[0] * scale_mask, 0) 124 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 125 | elif action == 4: 126 | offset_aux = (size_mask[0] * scale_mask, 127 | size_mask[1] * scale_mask) 128 | offset = (offset[0] + size_mask[0] * scale_mask, 129 | offset[1] + size_mask[1] * scale_mask) 130 | elif action == 5: 131 | offset_aux = (size_mask[0] * scale_mask / 2, 132 | size_mask[0] * scale_mask / 2) 133 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 134 | offset[1] + size_mask[0] * scale_mask / 2) 135 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 136 | offset_aux[1]:offset_aux[1] + size_mask[1]] 137 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 138 | draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 139 | region_mask, image_name, bool_draw) 140 | # trigger action 141 | if action == 6: 142 | offset = (0, 0) 143 | status = 0 144 | if step == 1: 145 | absolute_status = 0 146 | if only_first_object == 1: 147 | absolute_status = 0 148 | image_for_search = mask_image_with_mean_background(region_mask, image_for_search) 149 | region_image = image_for_search 150 | feature_maps = get_feature_maps(model_vgg, region_image) 151 | history_vector = update_history_vector(history_vector, action) 152 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 153 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 154 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 155 | state = get_state_pool45(history_vector, region_descriptor_2) 156 | -------------------------------------------------------------------------------- /scripts/pool45_crops_training.py: -------------------------------------------------------------------------------- 1 | import cv2, numpy as np 2 | import time 3 | import math as mth 4 | 
from PIL import Image, ImageDraw, ImageFont 5 | import scipy.io 6 | from keras.models import Sequential 7 | from keras import initializations 8 | from keras.initializations import normal, identity 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten 10 | from keras.optimizers import RMSprop, SGD, Adam 11 | import random 12 | import argparse 13 | from scipy import ndimage 14 | from keras.preprocessing import image 15 | from sklearn.preprocessing import OneHotEncoder 16 | 17 | from features import get_image_descriptor_for_image, obtain_compiled_vgg_16, vgg_16, \ 18 | get_conv_image_descriptor_for_image, calculate_all_initial_feature_maps 19 | from parse_xml_annotations import * 20 | from image_helper import * 21 | from metrics import * 22 | from visualization import * 23 | from reinforcement import * 24 | 25 | 26 | # Read number of epoch to be trained, to make checkpointing 27 | parser = argparse.ArgumentParser(description='Epoch:') 28 | parser.add_argument("-n", metavar='N', type=int, default=0) 29 | args = parser.parse_args() 30 | epochs_id = int(args.n) 31 | 32 | 33 | if __name__ == "__main__": 34 | 35 | ######## PATHS definition ######## 36 | 37 | # path of PASCAL VOC 2012 or other database to use for training 38 | path_voc = "./VOC2012_train/" 39 | # path of other PASCAL VOC dataset, if you want to train with 2007 and 2012 train datasets 40 | # path_voc2 = "/gpfs/projects/bsc31/bsc31429/VOC2007_train/" 41 | # path of where to store the models 42 | path_model = "../models_pool45_crops" 43 | # path of where to store visualizations of search sequences 44 | path_testing_folder = '../testing' 45 | # path of VGG16 weights 46 | path_vgg = "../vgg16_weights.h5" 47 | 48 | ######## PARAMETERS ######## 49 | 50 | # Class category of PASCAL that the RL agent will be searching 51 | class_object = 1 52 | # Scale of subregion for the hierarchical regions (to deal with 2/4, 3/4) 53 | scale_subregion = float(3)/4 54 | scale_mask = float(1)/(scale_subregion*4) 55 | # 1 if you want to obtain visualizations of the search for objects 56 | bool_draw = 0 57 | # How many steps can run the agent until finding one object 58 | number_of_steps = 10 59 | # Boolean to indicate if you want to use the two databases, or just one 60 | two_databases = 0 61 | epochs = 50 62 | gamma = 0.90 63 | epsilon = 1 64 | batch_size = 100 65 | # Pointer to where to store the last experience in the experience replay buffer, 66 | # actually there is a pointer for each PASCAL category, in case all categories 67 | # are trained at the same time 68 | h = np.zeros([20]) 69 | # Each replay memory (one for each possible category) has a capacity of 100 experiences 70 | buffer_experience_replay = 1000 71 | # Init replay memories 72 | replay = [[] for i in range(20)] 73 | reward = 0 74 | 75 | ######## MODELS ######## 76 | 77 | model_vgg = get_convolutional_vgg16_compiled(path_vgg) 78 | 79 | # If you want to train it from first epoch, first option is selected. Otherwise, 80 | # when making checkpointing, weights of last stored weights are loaded for a particular class object 81 | # NOTICE that for POOL45 model, this script only can train one class category at a time. 
We did this as 82 | # we are pre-computing features and storing them to RAM, and it is not possible to store features for all 83 | # objects of all classes 84 | 85 | if epochs_id == 0: 86 | model = get_q_network("0") 87 | else: 88 | model = get_q_network(path_model + '/model' + str(class_object-1) + 'h5') 89 | 90 | ######## LOAD IMAGE NAMES ######## 91 | 92 | if two_databases == 1: 93 | image_names_1 = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc)]) 94 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc) 95 | image_names_1_2 = [] 96 | for i in range(0, np.size(labels)): 97 | if labels[i] == "1": 98 | image_names_1_2.append(image_names_1[0][i]) 99 | image_names_2 = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc2)]) 100 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc2) 101 | image_names_2_2 = [] 102 | for i in range(0, np.size(labels)): 103 | if labels[i] == "1": 104 | image_names_2_2.append(image_names_2[0][i]) 105 | image_names = np.concatenate([image_names_1_2, image_names_2_2], axis=1) 106 | else: 107 | image_names = np.array([load_images_names_in_data_set('aeroplane_trainval', path_voc)]) 108 | # We check in the annotations which of the images actually contain the class category that we want 109 | # notice that as we want to train it for planes (class category 1) we input this subset of the database 110 | labels = load_images_labels_in_data_set('aeroplane_trainval', path_voc) 111 | image_names_2 = [] 112 | for i in range(0, np.size(labels)): 113 | if labels[i] == "1": 114 | image_names_2.append(image_names[0][i]) 115 | image_names = image_names_2 116 | 117 | ######## LOAD IMAGES ######## 118 | 119 | if two_databases == 1: 120 | images1 = get_all_images_pool(image_names_1_2, path_voc) 121 | images2 = get_all_images_pool(image_names_2_2, path_voc2) 122 | images = images1 + images2 123 | else: 124 | images = get_all_images_pool(image_names, path_voc) 125 | 126 | 127 | ######## PRECOMPUTE ALL INITIAL FEATURE MAPS ######## 128 | 129 | if two_databases == 1: 130 | initial_feature_maps1 = calculate_all_initial_feature_maps(images1, model_vgg, image_names_1_2) 131 | initial_feature_maps2 = calculate_all_initial_feature_maps(images2, model_vgg, image_names_2_2) 132 | initial_feature_maps = initial_feature_maps1 + initial_feature_maps2 133 | else: 134 | initial_feature_maps = calculate_all_initial_feature_maps(images, model_vgg, image_names) 135 | 136 | for i in range(epochs_id, epochs_id+epochs_batch): 137 | for j in range(np.size(image_names)): 138 | masked = 0 139 | not_finished = 1 140 | image = np.array(images[j]) 141 | image_name = image_names[j] 142 | feature_maps = initial_feature_maps[j] 143 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 144 | if two_databases == 1: 145 | if j < np.size(image_names1_2): 146 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc) 147 | else: 148 | annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc2) 149 | gt_masks = generate_bounding_box_from_annotation(annotation, image.shape) 150 | array_classes_gt_objects = get_ids_objects_from_annotation(annotation) 151 | region_mask = np.ones([image.shape[0], image.shape[1]]) 152 | shape_gt_masks = np.shape(gt_masks) 153 | available_objects = np.ones(np.size(array_classes_gt_objects)) 154 | # Iterate through all the objects in the ground truth of an image 155 | for k in range(np.size(array_classes_gt_objects)): 156 | # Init visualization 157 | 
background = Image.new('RGBA', (10000, 2500), (255, 255, 255, 255)) 158 | draw = ImageDraw.Draw(background) 159 | # We check whether the ground truth object is of the target class category 160 | if array_classes_gt_objects[k] == class_object: 161 | gt_mask = gt_masks[:, :, k] 162 | step = 0 163 | reward = 0 164 | # this matrix stores the IoU of each object of the ground-truth, just in case 165 | # the agent changes of observed object 166 | last_matrix = np.zeros([np.size(array_classes_gt_objects)]) 167 | new_iou = 0 168 | region_image = image 169 | offset = (0, 0) 170 | size_mask = (image.shape[0], image.shape[1]) 171 | original_shape = size_mask 172 | old_region_mask = region_mask 173 | region_mask = np.ones([image.shape[0], image.shape[1]]) 174 | # If the ground truth object is already masked by other already found masks, do not 175 | # use it for training 176 | if masked == 1: 177 | for p in range(gt_masks.shape[2]): 178 | overlap = calculate_overlapping(old_region_mask, gt_masks[:, :, p]) 179 | if overlap > 0.6: 180 | available_objects[p] = 0 181 | # We check if there are still objects to be found 182 | if np.count_nonzero(available_objects) == 0: 183 | not_finished = 0 184 | # follow_iou function calculates at each time step which is the groun truth object 185 | # that overlaps more with the visual region, so that we can calculate the rewards appropiately 186 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, array_classes_gt_objects, 187 | class_object, last_matrix, available_objects) 188 | new_iou = iou 189 | gt_mask = gt_masks[:, :, index] 190 | # init of the history vector that indicates past actions (6 actions * 4 steps in the memory) 191 | history_vector = np.zeros([24]) 192 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 193 | # calculate descriptor of region by ROI-pooling 194 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 195 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 196 | # computation of the initial state 197 | state = get_state_pool45(history_vector, region_descriptor_2) 198 | # status indicates whether the agent is still alive and has not triggered the terminal action 199 | status = 1 200 | action = 0 201 | if step > number_of_steps: 202 | background = draw_sequences(i, k, step, action, draw, region_image, background, 203 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 204 | bool_draw) 205 | step += 1 206 | while (status == 1) & (step < number_of_steps) & not_finished: 207 | category = int(array_classes_gt_objects[k]-1) 208 | counter[category] += 1 209 | qval = model.predict(state.T, batch_size=1) 210 | background = draw_sequences(i, k, step, action, draw, region_image, background, 211 | path_testing_folder, iou, reward, gt_mask, region_mask, image_name, 212 | bool_draw) 213 | step += 1 214 | # we force terminal action in case actual IoU is higher than 0.5, to train faster the agent 215 | if (i < 100) & (new_iou > 0.5): 216 | action = 6 217 | # epsilon-greedy policy 218 | elif random.random() < epsilon: 219 | action = np.random.randint(1, 7) 220 | else: 221 | action = (np.argmax(qval))+1 222 | # terminal action 223 | if action == 6: 224 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 225 | array_classes_gt_objects, class_object, 226 | last_matrix, available_objects) 227 | gt_mask = gt_masks[:, :, index] 228 | reward = get_reward_trigger(new_iou) 229 | background = draw_sequences(i, k, step, action, draw, 
region_image, background, 230 | path_testing_folder, iou, reward, gt_mask, region_mask, 231 | image_name, bool_draw) 232 | step += 1 233 | # movement action, we perform the crop of the corresponding subregion 234 | else: 235 | region_mask = np.zeros(original_shape) 236 | size_mask = (size_mask[0] * scale_subregion, size_mask[1] * scale_subregion) 237 | if action == 1: 238 | offset_aux = (0, 0) 239 | elif action == 2: 240 | offset_aux = (0, size_mask[1] * scale_mask) 241 | offset = (offset[0], offset[1] + size_mask[1] * scale_mask) 242 | elif action == 3: 243 | offset_aux = (size_mask[0] * scale_mask, 0) 244 | offset = (offset[0] + size_mask[0] * scale_mask, offset[1]) 245 | elif action == 4: 246 | offset_aux = (size_mask[0] * scale_mask, 247 | size_mask[1] * scale_mask) 248 | offset = (offset[0] + size_mask[0] * scale_mask, 249 | offset[1] + size_mask[1] * scale_mask) 250 | elif action == 5: 251 | offset_aux = (size_mask[0] * scale_mask / 2, 252 | size_mask[0] * scale_mask / 2) 253 | offset = (offset[0] + size_mask[0] * scale_mask / 2, 254 | offset[1] + size_mask[0] * scale_mask / 2) 255 | region_image = region_image[offset_aux[0]:offset_aux[0] + size_mask[0], 256 | offset_aux[1]:offset_aux[1] + size_mask[1]] 257 | region_mask[offset[0]:offset[0] + size_mask[0], offset[1]:offset[1] + size_mask[1]] = 1 258 | # new_IoU=calculateIoU(region_mask,gt_mask) 259 | iou, new_iou, last_matrix, index = follow_iou(gt_masks, region_mask, 260 | array_classes_gt_objects, class_object, 261 | last_matrix, available_objects) 262 | gt_mask = gt_masks[:, :, index] 263 | reward = get_reward_movement(iou, new_iou) 264 | iou = new_iou 265 | history_vector = update_history_vector(history_vector, action) 266 | region_coordinates = np.array([offset[0], offset[1], size_mask[0], size_mask[1]]) 267 | region_descriptor = obtain_descriptor_from_feature_map(feature_maps, region_coordinates) 268 | region_descriptor_2 = np.reshape(region_descriptor, (25088, 1)) 269 | new_state = get_state_pool45(history_vector, region_descriptor_2) 270 | #Experience replay storage 271 | if len(replay[category]) < buffer_experience_replay: 272 | replay[category].append((state, action, reward, new_state)) 273 | else: 274 | if h[category] < (buffer_experience_replay-1): 275 | h[category] += 1 276 | else: 277 | h[category] = 0 278 | h_aux = h[category] 279 | h_aux = int(h_aux) 280 | replay[category][h_aux] = (state, action, reward, new_state) 281 | minibatch = random.sample(replay[category], batch_size) 282 | X_train = [] 283 | y_train = [] 284 | # we pick from the replay memory a sampled minibatch and generate the training samples 285 | for memory in minibatch: 286 | old_state, action, reward, new_state = memory 287 | old_qval = model.predict(old_state.T, batch_size=1) 288 | newQ = model.predict(new_state.T, batch_size=1) 289 | maxQ = np.max(newQ) 290 | y = np.zeros([1, 6]) 291 | y = old_qval 292 | y = y.T 293 | if action != 6: #non-terminal state 294 | update = (reward + (gamma * maxQ)) 295 | else: #terminal state 296 | update = reward 297 | y[action-1] = update #target output 298 | X_train.append(old_state) 299 | y_train.append(y) 300 | X_train = np.array(X_train) 301 | y_train = np.array(y_train) 302 | X_train = X_train.astype("float32") 303 | y_train = y_train.astype("float32") 304 | X_train = X_train[:, :, 0] 305 | y_train = y_train[:, :, 0] 306 | hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=0) 307 | state = new_state 308 | if action == 6: 309 | status = 0 310 | masked = 1 311 | # we mask object found with 
ground-truth so that agent learns faster 312 | image = mask_image_with_mean_background(gt_mask, image) 313 | else: 314 | masked = 0 315 | available_objects[index] = 0 316 | if epsilon > 0.1: 317 | epsilon -= 0.1 318 | string = path_model + '/model' + str(class_object-1) + '_epoch_' + str(i) + 'h5' 319 | string2 = path_model + '/model' + str(class_object-1) + 'h5' 320 | model.save_weights(string, overwrite=True) 321 | model.save_weights(string2, overwrite=True) 322 | 323 | -------------------------------------------------------------------------------- /scripts/reinforcement.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras import initializations 4 | from keras.initializations import normal, identity 5 | from keras.layers.core import Dense, Dropout, Activation, Flatten 6 | from keras.layers.recurrent import LSTM 7 | from keras.optimizers import RMSprop, SGD, Adam 8 | from features import * 9 | 10 | # Different actions that the agent can do 11 | number_of_actions = 6 12 | # Actions captures in the history vector 13 | actions_of_history = 4 14 | # Visual descriptor size 15 | visual_descriptor_size = 25088 16 | # Reward movement action 17 | reward_movement_action = 1 18 | # Reward terminal action 19 | reward_terminal_action = 3 20 | # IoU required to consider a positive detection 21 | iou_threshold = 0.5 22 | 23 | 24 | def update_history_vector(history_vector, action): 25 | action_vector = np.zeros(number_of_actions) 26 | action_vector[action-1] = 1 27 | size_history_vector = np.size(np.nonzero(history_vector)) 28 | updated_history_vector = np.zeros(number_of_actions*actions_of_history) 29 | if size_history_vector < actions_of_history: 30 | aux2 = 0 31 | for l in range(number_of_actions*size_history_vector, number_of_actions*size_history_vector+number_of_actions - 1): 32 | history_vector[l] = action_vector[aux2] 33 | aux2 += 1 34 | return history_vector 35 | else: 36 | for j in range(0, number_of_actions*(actions_of_history-1) - 1): 37 | updated_history_vector[j] = history_vector[j+number_of_actions] 38 | aux = 0 39 | for k in range(number_of_actions*(actions_of_history-1), number_of_actions*actions_of_history): 40 | updated_history_vector[k] = action_vector[aux] 41 | aux += 1 42 | return updated_history_vector 43 | 44 | 45 | def get_state(image, history_vector, model_vgg): 46 | descriptor_image = get_conv_image_descriptor_for_image(image, model_vgg) 47 | descriptor_image = np.reshape(descriptor_image, (visual_descriptor_size, 1)) 48 | history_vector = np.reshape(history_vector, (number_of_actions*actions_of_history, 1)) 49 | state = np.vstack((descriptor_image, history_vector)) 50 | return state 51 | 52 | 53 | def get_state_pool45(history_vector, region_descriptor): 54 | history_vector = np.reshape(history_vector, (24, 1)) 55 | return np.vstack((region_descriptor, history_vector)) 56 | 57 | 58 | def get_reward_movement(iou, new_iou): 59 | if new_iou > iou: 60 | reward = reward_movement_action 61 | else: 62 | reward = - reward_movement_action 63 | return reward 64 | 65 | 66 | def get_reward_trigger(new_iou): 67 | if new_iou > iou_threshold: 68 | reward = reward_terminal_action 69 | else: 70 | reward = - reward_terminal_action 71 | return reward 72 | 73 | 74 | def get_q_network(weights_path): 75 | model = Sequential() 76 | model.add(Dense(1024, init=lambda shape, name: normal(shape, scale=0.01, name=name), input_shape=(25112,))) 77 | model.add(Activation('relu')) 78 | 
model.add(Dropout(0.2)) 79 | model.add(Dense(1024, init=lambda shape, name: normal(shape, scale=0.01, name=name))) 80 | model.add(Activation('relu')) 81 | model.add(Dropout(0.2)) 82 | model.add(Dense(6, init=lambda shape, name: normal(shape, scale=0.01, name=name))) 83 | model.add(Activation('linear')) 84 | adam = Adam(lr=1e-6) 85 | model.compile(loss='mse', optimizer=adam) 86 | if weights_path != "0": 87 | model.load_weights(weights_path) 88 | return model 89 | 90 | 91 | def get_array_of_q_networks_for_pascal(weights_path, class_object): 92 | q_networks = [] 93 | if weights_path == "0": 94 | for i in range(20): 95 | q_networks.append(get_q_network("0")) 96 | else: 97 | for i in range(20): 98 | if i == (class_object-1): 99 | q_networks.append(get_q_network(weights_path + "/model" + str(i) + "h5")) 100 | else: 101 | q_networks.append(get_q_network("0")) 102 | return np.array([q_networks]) -------------------------------------------------------------------------------- /scripts/visualization.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont 2 | import numpy as np 3 | 4 | path_font = "/usr/share/fonts/liberation/LiberationMono-Regular.ttf" 5 | font = ImageFont.truetype(path_font, 24) 6 | 7 | 8 | def string_for_action(action): 9 | if action == 0: 10 | return "START" 11 | if action == 1: 12 | return 'up-left' 13 | elif action == 2: 14 | return 'up-right' 15 | elif action == 3: 16 | return 'down-left' 17 | elif action == 4: 18 | return 'down-right' 19 | elif action == 5: 20 | return 'center' 21 | elif action == 6: 22 | return 'TRIGGER' 23 | 24 | 25 | def draw_sequences(i, k, step, action, draw, region_image, background, path_testing_folder, iou, reward, 26 | gt_mask, region_mask, image_name, save_boolean): 27 | mask = Image.fromarray(255 * gt_mask) 28 | mask_img = Image.fromarray(255 * region_mask) 29 | image_offset = (1000 * step, 70) 30 | text_offset = (1000 * step, 550) 31 | masked_image_offset = (1000 * step, 1400) 32 | mask_offset = (1000 * step, 700) 33 | action_string = string_for_action(action) 34 | footnote = 'action: ' + action_string + ' ' + 'reward: ' + str(reward) + ' Iou:' + str(iou) 35 | draw.text(text_offset, str(footnote), (0, 0, 0), font=font) 36 | img_for_paste = Image.fromarray(region_image) 37 | background.paste(img_for_paste, image_offset) 38 | background.paste(mask, mask_offset) 39 | background.paste(mask_img, masked_image_offset) 40 | file_name = path_testing_folder + '/' + image_name + str(i) + '_object_' + str(k) + '.png' 41 | if save_boolean == 1: 42 | background.save(file_name) 43 | return background 44 | 45 | 46 | def draw_sequences_test(step, action, qval, draw, region_image, background, path_testing_folder, 47 | region_mask, image_name, save_boolean): 48 | aux = np.asarray(region_image, np.uint8) 49 | img_offset = (1000 * step, 70) 50 | footnote_offset = (1000 * step, 550) 51 | q_predictions_offset = (1000 * step, 500) 52 | mask_img_offset = (1000 * step, 700) 53 | img_for_paste = Image.fromarray(aux) 54 | background.paste(img_for_paste, img_offset) 55 | mask_img = Image.fromarray(255 * region_mask) 56 | background.paste(mask_img, mask_img_offset) 57 | footnote = 'action: ' + str(action) 58 | q_val_predictions_text = str(qval) 59 | draw.text(footnote_offset, footnote, (0, 0, 0), font=font) 60 | draw.text(q_predictions_offset, q_val_predictions_text, (0, 0, 0), font=font) 61 | file_name = path_testing_folder + image_name + '.png' 62 | if save_boolean == 1: 63 | 
background.save(file_name) 64 | return background 65 | 66 | 67 | --------------------------------------------------------------------------------
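The annotation helpers in parse_xml_annotations.py turn a PASCAL VOC XML file into an (n, 5) array of [class id, xmin, xmax, ymin, ymax] rows, from which per-object binary masks are built. A minimal usage sketch follows; the VOC path and image id are illustrative only, and note that generate_bounding_box_from_annotation slices with the raw float coordinates, so on recent NumPy versions they need an explicit int() cast first:

    import cv2
    from parse_xml_annotations import (get_bb_of_gt_from_pascal_xml_annotation,
                                       generate_bounding_box_from_annotation,
                                       get_ids_objects_from_annotation)

    voc_path = "./VOC2007_test"      # illustrative: any folder with Annotations/ and JPEGImages/
    image_id = "000005"              # illustrative PASCAL VOC image id

    annotation = get_bb_of_gt_from_pascal_xml_annotation(image_id, voc_path)
    class_ids = get_ids_objects_from_annotation(annotation)   # PASCAL ids 1..20 (1 = aeroplane)
    image = cv2.imread(voc_path + "/JPEGImages/" + image_id + ".jpg")
    gt_masks = generate_bounding_box_from_annotation(annotation, image.shape)
    # gt_masks[:, :, k] is an (H, W) mask set to 1 inside the k-th ground-truth box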
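calculate_iou in metrics.py compares two whole-image binary masks, and reinforcement.py maps IoU changes to rewards: a movement is worth +1 if the IoU with the attended ground-truth object improved and -1 otherwise, while the trigger is worth +3 only if the final IoU exceeds the 0.5 threshold. A toy example with made-up 10 x 10 masks, assuming scripts/ is on the import path:

    import numpy as np
    from metrics import calculate_iou

    # ground-truth box covers rows 0-5, the attended region covers rows 2-7
    gt_mask = np.zeros((10, 10))
    gt_mask[:6, :] = 1.0
    region_mask = np.zeros((10, 10))
    region_mask[2:8, :] = 1.0

    iou = calculate_iou(region_mask, gt_mask)   # 40 shared pixels / 80 in the union = 0.5
    # get_reward_movement(0.3, iou) -> +1, because the IoU improved over the previous step
    # get_reward_trigger(iou)       -> -3, because 0.5 does not exceed the 0.5 threshold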
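Every state ends with a 24-dimensional history vector, a rolling one-hot record of the last four actions (6 actions x 4 remembered steps) maintained by update_history_vector. A compact slice-based sketch of the same idea follows (the helper name is ours): it always keeps the newest action in the last six-element block, whereas the original first fills empty blocks from the left and, as written, its explicit loops stop one index short of each block, so the sixth component of a block is never set.

    import numpy as np

    NUMBER_OF_ACTIONS = 6
    ACTIONS_OF_HISTORY = 4

    def updated_history(history_vector, action):
        """Shift the 6 x 4 action history one block to the left and append the new action."""
        one_hot = np.zeros(NUMBER_OF_ACTIONS)
        one_hot[action - 1] = 1
        new_history = np.zeros(NUMBER_OF_ACTIONS * ACTIONS_OF_HISTORY)
        # keep the three most recent actions, dropping the oldest block
        new_history[:NUMBER_OF_ACTIONS * (ACTIONS_OF_HISTORY - 1)] = history_vector[NUMBER_OF_ACTIONS:]
        # write the newest action into the last block
        new_history[NUMBER_OF_ACTIONS * (ACTIONS_OF_HISTORY - 1):] = one_hot
        return new_history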
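The five movement actions share the same geometry in image_zooms_training.py, pool45_crops_training.py and pool45_crops_testing.py: the child window keeps scale_subregion (3/4 by default) of the parent per side, and since scale_mask = 1/(4 * scale_subregion), the offset step new_size * scale_mask is exactly the quarter of the parent left over, so the five candidate windows overlap. A self-contained sketch of that mapping (the helper name zoom_region is ours; coordinates are cast to int here, and the centre shift is computed per axis, while the scripts keep floats and derive both centre shifts from the row size):

    def zoom_region(offset, size, action, scale_subregion=3.0 / 4):
        """Return (offset, size) of the subwindow picked by a movement action.

        offset -- (row, col) of the current window in absolute image coordinates
        size   -- (height, width) of the current window
        action -- 1 up-left, 2 up-right, 3 down-left, 4 down-right, 5 centre
        """
        new_size = (int(size[0] * scale_subregion), int(size[1] * scale_subregion))
        step = (size[0] - new_size[0], size[1] - new_size[1])   # margin left inside the parent
        if action == 1:
            delta = (0, 0)
        elif action == 2:
            delta = (0, step[1])
        elif action == 3:
            delta = (step[0], 0)
        elif action == 4:
            delta = (step[0], step[1])
        elif action == 5:
            delta = (step[0] // 2, step[1] // 2)
        else:
            raise ValueError("movement actions are 1..5; action 6 is the trigger")
        return (offset[0] + delta[0], offset[1] + delta[1]), new_size

Applied repeatedly, the window after n zooms covers (3/4)^n of the original side length, which is what produces the hierarchical analysis described in the paper.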
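During training the action is chosen epsilon-greedily over the six predicted Q-values, with one shortcut: in the early epochs the trigger is forced whenever the current IoU is already above 0.5, so the agent sees positive terminal rewards from the very beginning. The same logic, pulled out into a helper whose name is ours:

    import random
    import numpy as np

    def select_action(qval, new_iou, epoch, epsilon, force_threshold=0.5):
        """Epsilon-greedy selection with the forced trigger used while training."""
        if epoch < 100 and new_iou > force_threshold:
            return 6                          # force the terminal action on good regions
        if random.random() < epsilon:
            return np.random.randint(1, 7)    # explore: any of the six actions
        return int(np.argmax(qval)) + 1       # exploit: action with the highest Q-value

In the scripts epsilon starts at 1 and is decreased by 0.1 per epoch down to 0.1, so exploration is mostly annealed after about ten epochs.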
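Each stored transition (state, action, reward, new_state) becomes a regression target by copying the network's current Q-vector for the old state and overwriting only the entry of the taken action: with reward + gamma * max Q(new_state) for movement actions and with the bare reward for the terminal trigger. A sketch of that target construction (the helper name is ours), using the gamma = 0.90 and six-action layout of the scripts:

    import numpy as np

    def build_q_target(model, transition, gamma=0.90, trigger_action=6):
        """Turn one replay-buffer entry into an (input, target) pair for model.fit."""
        old_state, action, reward, new_state = transition
        target = model.predict(old_state.T, batch_size=1)[0].copy()   # shape (6,)
        if action == trigger_action:
            target[action - 1] = reward                                # terminal: no bootstrap
        else:
            max_future_q = np.max(model.predict(new_state.T, batch_size=1))
            target[action - 1] = reward + gamma * max_future_q
        return old_state, target

Sampling batch_size = 100 such transitions from the replay memory, stacking the pairs and fitting for a single epoch reproduces the update performed inside the training loops.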
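The scripts target the Keras 1.x API (keras.initializations, init=..., nb_epoch=...), so they should be run with the versions pinned in requirements.txt. For orientation only, a rough Keras 2 sketch of the Q-network that get_q_network builds (25112 -> 1024 -> 1024 -> 6, MSE loss, Adam with a 1e-6 learning rate); this is an approximation, not a tested drop-in replacement:

    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.initializers import RandomNormal
    from keras.optimizers import Adam

    def get_q_network_keras2(weights_path="0"):
        init = RandomNormal(stddev=0.01)
        model = Sequential()
        model.add(Dense(1024, kernel_initializer=init, input_shape=(25112,)))
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1024, kernel_initializer=init))
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
        model.add(Dense(6, kernel_initializer=init))
        model.add(Activation('linear'))
        model.compile(loss='mse', optimizer=Adam(lr=1e-6))
        if weights_path != "0":
            model.load_weights(weights_path)
        return model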
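pool45_crops_training.py checkpoints the Q-network after every epoch as a per-epoch file plus a rolling file, both built by plain string concatenation with no dot before the h5 suffix, and the -n argument selects the epoch to resume from, in which case the rolling file is reloaded. The naming, spelled out (note that the epoch loop also references epochs_batch, which is not defined among the parameters in this excerpt, so it has to be set, for example to epochs, before a run):

    path_model = "../models_pool45_crops"
    class_object = 1                     # aeroplane
    epoch = 10

    per_epoch_file = path_model + '/model' + str(class_object - 1) + '_epoch_' + str(epoch) + 'h5'
    rolling_file = path_model + '/model' + str(class_object - 1) + 'h5'
    # -> '../models_pool45_crops/model0_epoch_10h5' and '../models_pool45_crops/model0h5'
    # a later run started with `python pool45_crops_training.py -n 10` reloads the rolling file,
    # since get_q_network is called with exactly the same concatenated name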