├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── config.py ├── data_engine ├── README.md ├── __init__.py ├── generate_corpus_full_history.py ├── generate_descriptions_lists.py ├── generate_features_lists.py ├── generate_img_lists.py ├── generate_img_lists_from_split.py ├── generate_link_lists.py ├── generate_parallel_corpus.py ├── prepare_data.py ├── split_data.py └── subsample_frames_features.py ├── docs └── model.png ├── main.py ├── meta-optimizers └── spearmint │ ├── README.md │ ├── __init__.py │ ├── config.json │ ├── launch_spearmint.sh │ └── spearmint_opt.py ├── train.sh ├── turing_test.py ├── utils ├── __init__.py ├── common.py ├── evaluate_from_file.py ├── plot_metric.sh ├── prepare_features.py ├── pretrain_word_vectors.py ├── sort_by_split.py ├── split_features.py └── vocabulary_size.sh ├── viddesc_model.py └── visualization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | 3 | .idea 4 | 5 | /meta-optimizers/spearmint/db/ 6 | /meta-optimizers/spearmint/trained_models/ 7 | /meta-optimizers/spearmint/output/ 8 | 9 | ### Python template 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | ### Emacs template 69 | # -*- mode: gitignore; -*- 70 | *~ 71 | \#*\# 72 | /.emacs.desktop 73 | /.emacs.desktop.lock 74 | *.elc 75 | auto-save-list 76 | tramp 77 | .\#* 78 | 79 | # Org-mode 80 | .org-id-locations 81 | *_archive 82 | 83 | # flymake-mode 84 | *_flymake.* 85 | 86 | # eshell files 87 | /eshell/history 88 | /eshell/lastdir 89 | 90 | # elpa packages 91 | /elpa/ 92 | 93 | # reftex files 94 | *.rel 95 | 96 | # AUCTeX auto folder 97 | /auto/ 98 | 99 | # cask packages 100 | .cask/ 101 | 102 | # Models 103 | *.pkl 104 | *.json 105 | *.h5 106 | *.npy 107 | *.zip 108 | 109 | # Training results 110 | *.vqa 111 | *.coco 112 | *.multiclass 113 | *.pred 114 | *.txt 115 | 116 | # Visualization files 117 | .ipynb_checkpoints 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | Copyright (c) 2016, the respective contributors 4 | All rights reserved. 5 | 6 | ABiViRNet uses a shared copyright model: each contributor 7 | holds copyright over their contributions to ABiViRNet. The project versioning records 8 | all such contribution and copyright details. 
If a contributor wants to further 9 | mark their specific copyright on a particular contribution, they should 10 | indicate their copyright solely in the commit message of the change when it 11 | is committed. 12 | 13 | 14 | GNU GENERAL PUBLIC LICENSE 15 | Version 2, June 1991 16 | 17 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 18 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 | Everyone is permitted to copy and distribute verbatim copies 20 | of this license document, but changing it is not allowed. 21 | 22 | Preamble 23 | 24 | The licenses for most software are designed to take away your 25 | freedom to share and change it. By contrast, the GNU General Public 26 | License is intended to guarantee your freedom to share and change free 27 | software--to make sure the software is free for all its users. This 28 | General Public License applies to most of the Free Software 29 | Foundation's software and to any other program whose authors commit to 30 | using it. (Some other Free Software Foundation software is covered by 31 | the GNU Lesser General Public License instead.) You can apply it to 32 | your programs, too. 33 | 34 | When we speak of free software, we are referring to freedom, not 35 | price. Our General Public Licenses are designed to make sure that you 36 | have the freedom to distribute copies of free software (and charge for 37 | this service if you wish), that you receive source code or can get it 38 | if you want it, that you can change the software or use pieces of it 39 | in new free programs; and that you know you can do these things. 40 | 41 | To protect your rights, we need to make restrictions that forbid 42 | anyone to deny you these rights or to ask you to surrender the rights. 43 | These restrictions translate to certain responsibilities for you if you 44 | distribute copies of the software, or if you modify it. 45 | 46 | For example, if you distribute copies of such a program, whether 47 | gratis or for a fee, you must give the recipients all the rights that 48 | you have. You must make sure that they, too, receive or can get the 49 | source code. And you must show them these terms so they know their 50 | rights. 51 | 52 | We protect your rights with two steps: (1) copyright the software, and 53 | (2) offer you this license which gives you legal permission to copy, 54 | distribute and/or modify the software. 55 | 56 | Also, for each author's protection and ours, we want to make certain 57 | that everyone understands that there is no warranty for this free 58 | software. If the software is modified by someone else and passed on, we 59 | want its recipients to know that what they have is not the original, so 60 | that any problems introduced by others will not reflect on the original 61 | authors' reputations. 62 | 63 | Finally, any free program is threatened constantly by software 64 | patents. We wish to avoid the danger that redistributors of a free 65 | program will individually obtain patent licenses, in effect making the 66 | program proprietary. To prevent this, we have made it clear that any 67 | patent must be licensed for everyone's free use or not licensed at all. 68 | 69 | The precise terms and conditions for copying, distribution and 70 | modification follow. 71 | 72 | GNU GENERAL PUBLIC LICENSE 73 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 74 | 75 | 0. 
This License applies to any program or other work which contains 76 | a notice placed by the copyright holder saying it may be distributed 77 | under the terms of this General Public License. The "Program", below, 78 | refers to any such program or work, and a "work based on the Program" 79 | means either the Program or any derivative work under copyright law: 80 | that is to say, a work containing the Program or a portion of it, 81 | either verbatim or with modifications and/or translated into another 82 | language. (Hereinafter, translation is included without limitation in 83 | the term "modification".) Each licensee is addressed as "you". 84 | 85 | Activities other than copying, distribution and modification are not 86 | covered by this License; they are outside its scope. The act of 87 | running the Program is not restricted, and the output from the Program 88 | is covered only if its contents constitute a work based on the 89 | Program (independent of having been made by running the Program). 90 | Whether that is true depends on what the Program does. 91 | 92 | 1. You may copy and distribute verbatim copies of the Program's 93 | source code as you receive it, in any medium, provided that you 94 | conspicuously and appropriately publish on each copy an appropriate 95 | copyright notice and disclaimer of warranty; keep intact all the 96 | notices that refer to this License and to the absence of any warranty; 97 | and give any other recipients of the Program a copy of this License 98 | along with the Program. 99 | 100 | You may charge a fee for the physical act of transferring a copy, and 101 | you may at your option offer warranty protection in exchange for a fee. 102 | 103 | 2. You may modify your copy or copies of the Program or any portion 104 | of it, thus forming a work based on the Program, and copy and 105 | distribute such modifications or work under the terms of Section 1 106 | above, provided that you also meet all of these conditions: 107 | 108 | a) You must cause the modified files to carry prominent notices 109 | stating that you changed the files and the date of any change. 110 | 111 | b) You must cause any work that you distribute or publish, that in 112 | whole or in part contains or is derived from the Program or any 113 | part thereof, to be licensed as a whole at no charge to all third 114 | parties under the terms of this License. 115 | 116 | c) If the modified program normally reads commands interactively 117 | when run, you must cause it, when started running for such 118 | interactive use in the most ordinary way, to print or display an 119 | announcement including an appropriate copyright notice and a 120 | notice that there is no warranty (or else, saying that you provide 121 | a warranty) and that users may redistribute the program under 122 | these conditions, and telling the user how to view a copy of this 123 | License. (Exception: if the Program itself is interactive but 124 | does not normally print such an announcement, your work based on 125 | the Program is not required to print an announcement.) 126 | 127 | These requirements apply to the modified work as a whole. If 128 | identifiable sections of that work are not derived from the Program, 129 | and can be reasonably considered independent and separate works in 130 | themselves, then this License, and its terms, do not apply to those 131 | sections when you distribute them as separate works. 
But when you 132 | distribute the same sections as part of a whole which is a work based 133 | on the Program, the distribution of the whole must be on the terms of 134 | this License, whose permissions for other licensees extend to the 135 | entire whole, and thus to each and every part regardless of who wrote it. 136 | 137 | Thus, it is not the intent of this section to claim rights or contest 138 | your rights to work written entirely by you; rather, the intent is to 139 | exercise the right to control the distribution of derivative or 140 | collective works based on the Program. 141 | 142 | In addition, mere aggregation of another work not based on the Program 143 | with the Program (or with a work based on the Program) on a volume of 144 | a storage or distribution medium does not bring the other work under 145 | the scope of this License. 146 | 147 | 3. You may copy and distribute the Program (or a work based on it, 148 | under Section 2) in object code or executable form under the terms of 149 | Sections 1 and 2 above provided that you also do one of the following: 150 | 151 | a) Accompany it with the complete corresponding machine-readable 152 | source code, which must be distributed under the terms of Sections 153 | 1 and 2 above on a medium customarily used for software interchange; or, 154 | 155 | b) Accompany it with a written offer, valid for at least three 156 | years, to give any third party, for a charge no more than your 157 | cost of physically performing source distribution, a complete 158 | machine-readable copy of the corresponding source code, to be 159 | distributed under the terms of Sections 1 and 2 above on a medium 160 | customarily used for software interchange; or, 161 | 162 | c) Accompany it with the information you received as to the offer 163 | to distribute corresponding source code. (This alternative is 164 | allowed only for noncommercial distribution and only if you 165 | received the program in object code or executable form with such 166 | an offer, in accord with Subsection b above.) 167 | 168 | The source code for a work means the preferred form of the work for 169 | making modifications to it. For an executable work, complete source 170 | code means all the source code for all modules it contains, plus any 171 | associated interface definition files, plus the scripts used to 172 | control compilation and installation of the executable. However, as a 173 | special exception, the source code distributed need not include 174 | anything that is normally distributed (in either source or binary 175 | form) with the major components (compiler, kernel, and so on) of the 176 | operating system on which the executable runs, unless that component 177 | itself accompanies the executable. 178 | 179 | If distribution of executable or object code is made by offering 180 | access to copy from a designated place, then offering equivalent 181 | access to copy the source code from the same place counts as 182 | distribution of the source code, even though third parties are not 183 | compelled to copy the source along with the object code. 184 | 185 | 4. You may not copy, modify, sublicense, or distribute the Program 186 | except as expressly provided under this License. Any attempt 187 | otherwise to copy, modify, sublicense or distribute the Program is 188 | void, and will automatically terminate your rights under this License. 
189 | However, parties who have received copies, or rights, from you under 190 | this License will not have their licenses terminated so long as such 191 | parties remain in full compliance. 192 | 193 | 5. You are not required to accept this License, since you have not 194 | signed it. However, nothing else grants you permission to modify or 195 | distribute the Program or its derivative works. These actions are 196 | prohibited by law if you do not accept this License. Therefore, by 197 | modifying or distributing the Program (or any work based on the 198 | Program), you indicate your acceptance of this License to do so, and 199 | all its terms and conditions for copying, distributing or modifying 200 | the Program or works based on it. 201 | 202 | 6. Each time you redistribute the Program (or any work based on the 203 | Program), the recipient automatically receives a license from the 204 | original licensor to copy, distribute or modify the Program subject to 205 | these terms and conditions. You may not impose any further 206 | restrictions on the recipients' exercise of the rights granted herein. 207 | You are not responsible for enforcing compliance by third parties to 208 | this License. 209 | 210 | 7. If, as a consequence of a court judgment or allegation of patent 211 | infringement or for any other reason (not limited to patent issues), 212 | conditions are imposed on you (whether by court order, agreement or 213 | otherwise) that contradict the conditions of this License, they do not 214 | excuse you from the conditions of this License. If you cannot 215 | distribute so as to satisfy simultaneously your obligations under this 216 | License and any other pertinent obligations, then as a consequence you 217 | may not distribute the Program at all. For example, if a patent 218 | license would not permit royalty-free redistribution of the Program by 219 | all those who receive copies directly or indirectly through you, then 220 | the only way you could satisfy both it and this License would be to 221 | refrain entirely from distribution of the Program. 222 | 223 | If any portion of this section is held invalid or unenforceable under 224 | any particular circumstance, the balance of the section is intended to 225 | apply and the section as a whole is intended to apply in other 226 | circumstances. 227 | 228 | It is not the purpose of this section to induce you to infringe any 229 | patents or other property right claims or to contest validity of any 230 | such claims; this section has the sole purpose of protecting the 231 | integrity of the free software distribution system, which is 232 | implemented by public license practices. Many people have made 233 | generous contributions to the wide range of software distributed 234 | through that system in reliance on consistent application of that 235 | system; it is up to the author/donor to decide if he or she is willing 236 | to distribute software through any other system and a licensee cannot 237 | impose that choice. 238 | 239 | This section is intended to make thoroughly clear what is believed to 240 | be a consequence of the rest of this License. 241 | 242 | 8. 
If the distribution and/or use of the Program is restricted in 243 | certain countries either by patents or by copyrighted interfaces, the 244 | original copyright holder who places the Program under this License 245 | may add an explicit geographical distribution limitation excluding 246 | those countries, so that distribution is permitted only in or among 247 | countries not thus excluded. In such case, this License incorporates 248 | the limitation as if written in the body of this License. 249 | 250 | 9. The Free Software Foundation may publish revised and/or new versions 251 | of the General Public License from time to time. Such new versions will 252 | be similar in spirit to the present version, but may differ in detail to 253 | address new problems or concerns. 254 | 255 | Each version is given a distinguishing version number. If the Program 256 | specifies a version number of this License which applies to it and "any 257 | later version", you have the option of following the terms and conditions 258 | either of that version or of any later version published by the Free 259 | Software Foundation. If the Program does not specify a version number of 260 | this License, you may choose any version ever published by the Free Software 261 | Foundation. 262 | 263 | 10. If you wish to incorporate parts of the Program into other free 264 | programs whose distribution conditions are different, write to the author 265 | to ask for permission. For software which is copyrighted by the Free 266 | Software Foundation, write to the Free Software Foundation; we sometimes 267 | make exceptions for this. Our decision will be guided by the two goals 268 | of preserving the free status of all derivatives of our free software and 269 | of promoting the sharing and reuse of software generally. 270 | 271 | NO WARRANTY 272 | 273 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 274 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 275 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 276 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 277 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 278 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 279 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 280 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 281 | REPAIR OR CORRECTION. 282 | 283 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 284 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 285 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 286 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 287 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 288 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 289 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 290 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 291 | POSSIBILITY OF SUCH DAMAGES. 292 | 293 | END OF TERMS AND CONDITIONS 294 | 295 | How to Apply These Terms to Your New Programs 296 | 297 | If you develop a new program, and you want it to be of the greatest 298 | possible use to the public, the best way to achieve this is to make it 299 | free software which everyone can redistribute and change under these terms. 
300 | 301 | To do so, attach the following notices to the program. It is safest 302 | to attach them to the start of each source file to most effectively 303 | convey the exclusion of warranty; and each file should have at least 304 | the "copyright" line and a pointer to where the full notice is found. 305 | 306 | {description} 307 | Copyright (C) {year} {fullname} 308 | 309 | This program is free software; you can redistribute it and/or modify 310 | it under the terms of the GNU General Public License as published by 311 | the Free Software Foundation; either version 2 of the License, or 312 | (at your option) any later version. 313 | 314 | This program is distributed in the hope that it will be useful, 315 | but WITHOUT ANY WARRANTY; without even the implied warranty of 316 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 317 | GNU General Public License for more details. 318 | 319 | You should have received a copy of the GNU General Public License along 320 | with this program; if not, write to the Free Software Foundation, Inc., 321 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 322 | 323 | Also add information on how to contact you by electronic and paper mail. 324 | 325 | If the program is interactive, make it output a short notice like this 326 | when it starts in an interactive mode: 327 | 328 | Gnomovision version 69, Copyright (C) year name of author 329 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 330 | This is free software, and you are welcome to redistribute it 331 | under certain conditions; type `show c' for details. 332 | 333 | The hypothetical commands `show w' and `show c' should show the appropriate 334 | parts of the General Public License. Of course, the commands you use may 335 | be called something other than `show w' and `show c'; they could even be 336 | mouse-clicks or menu items--whatever suits your program. 337 | 338 | You should also get your employer (if you work as a programmer) or your 339 | school, if any, to sign a "copyright disclaimer" for the program, if 340 | necessary. Here is a sample; alter the names: 341 | 342 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 343 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 344 | 345 | {signature of Ty Coon}, 1 April 1989 346 | Ty Coon, President of Vice 347 | 348 | This General Public License does not permit incorporating your program into 349 | proprietary programs. If your program is a subroutine library, you may 350 | consider it more useful to permit linking proprietary applications with the 351 | library. If this is what you want to do, use the GNU Lesser General 352 | Public License instead of this License. 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Egocentric Video Description based on Temporally-Linked Sequences 2 | 3 | This repository contains the code for building the Temporally-linked Multi-input Attention (TMA) model, which was presented in 4 | the work [Egocentric Video Description based on Temporally-Linked Sequences](), 5 | submitted to the [Journal of Visual Communication and Image Representation](https://www.journals.elsevier.com/journal-of-visual-communication-and-image-representation). 6 | With this module, you can replicate our experiments and easily deploy new models. 
TMA is built upon our fork of 7 | [Keras](https://github.com/MarcBS/keras) framework ([version 1.2](https://github.com/MarcBS/keras/tree/Keras-1.2-(stable))) and tested with the [Theano](http://deeplearning.net/software/theano) 8 | backend. 9 | 10 | ## Features: 11 | 12 | * Temporally-linked mechanism for learning using information from previous events. 13 | * Multi-input Attention LSTM model over any of the input multimodal sequences. 14 | * Peeked decoder LSTM: the previously generated word is an input of the current LSTM timestep. 15 | * MLPs for initializing the LSTM hidden and memory states. 16 | * Beam search decoding. 17 | 18 | ## Architecture 19 | 20 | ![TMA_model](./docs/model.png) 21 | 22 | ## Requirements 23 | 24 | TMA requires the following libraries: 25 | 26 | - [Our version of Keras](https://github.com/MarcBS/keras) >= 1.2.3 27 | - [Multimodal Keras Wrapper](https://github.com/MarcBS/multimodal_keras_wrapper) >= 0.7 28 | - [Coco-caption evaluation package](https://github.com/lvapeab/coco-caption/tree/master/pycocoevalcap/) 29 | 30 | ## Instructions: 31 | 32 | Assuming you have a dataset and features extracted from the video frames: 33 | 34 | 0) Set the paths to Keras and the Multimodal Keras Wrapper in `train.sh` 35 | 36 | 1) Prepare data: 37 | 38 | `` 39 | python data_engine/subsample_frames_features.py 40 | `` 41 | 42 | `` 43 | python data_engine/generate_features_lists.py 44 | `` 45 | 46 | `` 47 | python data_engine/generate_descriptions_lists.py 48 | `` 49 | 50 | See [data_engine/README.md](data_engine/README.md) for detailed information. 51 | 52 | 2) Prepare the inputs/outputs of your model in `data_engine/prepare_data.py` 53 | 54 | 3) Set a model configuration in `config.py` (a quick way to inspect the resulting configuration is shown in the "Checking the configuration" section below) 55 | 56 | 4) Train: 57 | 58 | `` 59 | python main.py 60 | `` 61 | 62 | ## Dataset 63 | 64 | The dataset [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) was used to evaluate this model. It was acquired with the wearable camera Narrative Clip, which takes a picture every 30 seconds (2 fpm). It consists of 55 days acquired by 9 people, containing a total of 48,717 images divided into 1,339 events (or image sequences) and 3,991 captions. 65 | 66 | ## Citation 67 | 68 | If you use this code for any purpose, please do not forget to cite the following paper: 69 | 70 | ``` 71 | Marc Bolaños, Álvaro Peris, Francisco Casacuberta, Sergi Soler and Petia Radeva. 72 | Egocentric Video Description based on Temporally-Linked Sequences 73 | In Special Issue on Egocentric Vision and Lifelogging Tools. 74 | Journal of Visual Communication and Image Representation (VCIR), (SUBMITTED). 75 | ``` 76 | 77 | ## About 78 | 79 | Joint collaboration between the [Computer Vision at the University of Barcelona (CVUB)](http://www.ub.edu/cvub/) group at [Universitat de Barcelona](http://www.ub.edu)-[CVC](http://www.cvc.uab.es) and the [PRHLT Research Center](https://www.prhlt.upv.es) at [Universitat Politècnica de València](https://www.upv.es).
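## Checking the configuration

All hyperparameters (dataset paths, model type, optimizer, beam size, etc.) are defined in `config.py` and returned as a dictionary by `load_parameters()`. The snippet below is only a quick sketch for inspecting the configuration currently defined in `config.py`; it assumes it is run from the repository root and it does not start any training:

```python
from config import load_parameters

params = load_parameters()

# A few of the parameters that are most often worth checking before a run.
print('Model type:   ' + params['MODEL_TYPE'])
print('Dataset:      ' + params['DATASET_NAME'])
print('Model name:   ' + params['MODEL_NAME'])
print('Results path: ' + params['STORE_PATH'])
print('Beam size:    ' + str(params['BEAM_SIZE']))
```

Note that `MODEL_NAME` (and therefore `STORE_PATH`) is built automatically from the main hyperparameters, which helps keep the results of different configurations in separate folders.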
80 | 81 | 82 | ## Contact 83 | 84 | Marc Bolaños ([web page](http://www.ub.edu/cvub/marcbolanos/)): marc.bolanos@ub.edu 85 | 86 | Álvaro Peris ([web page](http://lvapeab.github.io/)): lvapeab@prhlt.upv.es 87 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/__init__.py -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | def load_parameters(): 2 | """ 3 | Loads the defined parameters 4 | """ 5 | # Input data params 6 | DATA_ROOT_PATH = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | 8 | # preprocessed features 9 | DATASET_NAME = 'EDUB-SegDesc_features' # Dataset name (add '-linked' suffix for using 10 | # dataset with temporally-linked training data) 11 | # 12 | # -linked 13 | # -linked-upperbound 14 | # -linked-upperbound-copy 15 | # -linked-upperbound-prev 16 | # -linked-upperbound-nocopy 17 | # -linked-video 18 | # -linked-vidtext 19 | # -vidtext-embed 20 | # 21 | 22 | PRE_TRAINED_DATASET_NAME = None #'MSVD_features' # Dataset name for reusing vocabulary of pre-trained model (set to None for disabling) 23 | # (only applicable if we are using a pre-trained model, default None) 24 | VOCABULARIES_MAPPING = {'description': 'description', 25 | 'state_below': 'description', 26 | 'prev_description': 'description'} 27 | 28 | PRE_TRAINED_VOCABULARY_NAME = None #'1BillionWords_vocabulary' # Dataset name for reusing vocabulary of pre-trained model 29 | 30 | # Input data 31 | INPUT_DATA_TYPE = 'video-features' # 'video-features' or 'video' 32 | NUM_FRAMES = 26 # fixed number of input frames per video 33 | 34 | if '-noninfo' in DATASET_NAME: 35 | suffix_annotations = '_without_noninfo' 36 | suffix_features = '_Without_NonInfo' 37 | else: 38 | suffix_annotations = '' 39 | suffix_features = '' 40 | 41 | #### Features from video frames 42 | FRAMES_LIST_FILES = {'train': 'Annotations/%s/train_feat_list'+suffix_annotations+'.txt', # Feature frames list files 43 | 'val': 'Annotations/%s/val_feat_list'+suffix_annotations+'.txt', 44 | 'test': 'Annotations/%s/test_feat_list'+suffix_annotations+'.txt', 45 | } 46 | FRAMES_COUNTS_FILES = { 'train': 'Annotations/%s/train_feat_counts'+suffix_annotations+'.txt', # Frames counts files 47 | 'val': 'Annotations/%s/val_feat_counts'+suffix_annotations+'.txt', 48 | 'test': 'Annotations/%s/test_feat_counts'+suffix_annotations+'.txt', 49 | } 50 | FEATURE_NAMES = ['ImageNet' 51 | + suffix_features] # append '_L2' at the end of each feature type if using their L2 version 52 | 53 | # Output data 54 | DESCRIPTION_FILES = {'train': 'Annotations/train_descriptions'+suffix_annotations+'.txt', # Description files 55 | 'val': 'Annotations/val_descriptions'+suffix_annotations+'.txt', 56 | 'test': 'Annotations/test_descriptions'+suffix_annotations+'.txt', 57 | } 58 | DESCRIPTION_COUNTS_FILES = { 'train': 'Annotations/train_descriptions_counts'+suffix_annotations+'.npy', # Description counts files 59 | 'val': 'Annotations/val_descriptions_counts'+suffix_annotations+'.npy', 60 | 'test': 'Annotations/test_descriptions_counts'+suffix_annotations+'.npy', 61 | } 62 | 63 | # Dataset parameters 64 | if not '-vidtext-embed' in DATASET_NAME: 65 | INPUTS_IDS_DATASET = ['video', 'state_below'] # Corresponding inputs of the dataset 66 | OUTPUTS_IDS_DATASET = 
['description'] # Corresponding outputs of the dataset 67 | INPUTS_IDS_MODEL = ['video', 'state_below'] # Corresponding inputs of the built model 68 | OUTPUTS_IDS_MODEL = ['description'] # Corresponding outputs of the built model 69 | else: 70 | INPUTS_IDS_DATASET = ['video', 'description'] # Corresponding inputs of the dataset 71 | OUTPUTS_IDS_DATASET = ['match'] # Corresponding outputs of the dataset 72 | INPUTS_IDS_MODEL = ['video', 'description'] # Corresponding inputs of the built model 73 | OUTPUTS_IDS_MODEL = ['match'] # Corresponding outputs of the built model 74 | 75 | 76 | if '-linked' in DATASET_NAME: 77 | 78 | LINK_SAMPLE_FILES = {'train': 'Annotations/train_link_samples'+suffix_annotations+'.txt', # Links index files 79 | 'val': 'Annotations/val_link_samples'+suffix_annotations+'.txt', 80 | 'test': 'Annotations/test_link_samples'+suffix_annotations+'.txt', 81 | } 82 | 83 | INPUTS_IDS_DATASET.append('prev_description') 84 | INPUTS_IDS_MODEL.append('prev_description') 85 | 86 | if '-vidtext' in DATASET_NAME: 87 | INPUTS_IDS_DATASET.append('prev_video') 88 | INPUTS_IDS_MODEL.append('prev_video') 89 | 90 | if '-upperbound' not in DATASET_NAME and '-video' not in DATASET_NAME: 91 | INPUTS_IDS_DATASET.append('link_index') 92 | INPUTS_IDS_MODEL.append('link_index') 93 | 94 | 95 | # Evaluation params 96 | if not '-vidtext-embed' in DATASET_NAME: 97 | METRICS = ['coco'] # Metric used for evaluating model after each epoch (leave empty if only prediction is required) 98 | else: 99 | METRICS = ['multiclass_metrics'] 100 | EVAL_ON_SETS = ['val', 'test'] # Possible values: 'train', 'val' and 'test' (external evaluator) 101 | EVAL_ON_SETS_KERAS = [] # Possible values: 'train', 'val' and 'test' (Keras' evaluator) 102 | START_EVAL_ON_EPOCH = 0 # First epoch where the model will be evaluated 103 | EVAL_EACH_EPOCHS = False # Select whether evaluate between N epochs or N updates 104 | EVAL_EACH = 50 # Sets the evaluation frequency (epochs or updates) 105 | 106 | # Search parameters 107 | SAMPLING = 'max_likelihood' # Possible values: multinomial or max_likelihood (recommended) 108 | TEMPERATURE = 1 # Multinomial sampling parameter 109 | if not '-vidtext-embed' in DATASET_NAME: 110 | BEAM_SEARCH = True # Switches on-off the beam search procedure 111 | else: 112 | BEAM_SEARCH = False 113 | BEAM_SIZE = 10 # Beam size (in case of BEAM_SEARCH == True) 114 | BEAM_SEARCH_COND_INPUT = 1 # Index of the conditional input used in beam search (i.e., state_below) 115 | OPTIMIZED_SEARCH = True # Compute annotations only a single time per sample 116 | NORMALIZE_SAMPLING = False # Normalize hypotheses scores according to their length 117 | ALPHA_FACTOR = .6 # Normalization according to length**ALPHA_FACTOR 118 | # (see: arxiv.org/abs/1609.08144) 119 | 120 | # Sampling params: Show some samples during training 121 | if not '-vidtext-embed' in DATASET_NAME: 122 | SAMPLE_ON_SETS = ['train', 'val'] # Possible values: 'train', 'val' and 'test' 123 | else: 124 | SAMPLE_ON_SETS = [] 125 | N_SAMPLES = 5 # Number of samples generated 126 | START_SAMPLING_ON_EPOCH = 0 # First epoch where the model will be evaluated 127 | SAMPLE_EACH_UPDATES = 50 # Sampling frequency (default 450) 128 | 129 | # Word representation params 130 | TOKENIZATION_METHOD = 'tokenize_icann' # Select which tokenization we'll apply: 131 | # tokenize_basic, tokenize_aggressive, tokenize_soft, 132 | # tokenize_icann or tokenize_questions 133 | 134 | FILL = 'end' # whether we fill the 'end' or the 'start' of the sentence with 0s 135 | TRG_LAN = 'en' # 
Language of the outputs (mainly used for the Meteor evaluator) 136 | PAD_ON_BATCH = True # Whether we take as many timesteps as the longes sequence of the batch 137 | # or a fixed size (MAX_OUTPUT_TEXT_LEN) 138 | 139 | # Input image parameters 140 | DATA_AUGMENTATION = False # Apply data augmentation on input data (noise on features) 141 | DATA_AUGMENTATION_TYPE = ['random_selection'] # 'random_selection', 'noise' 142 | IMG_FEAT_SIZE = 1024 # Size of the image features 143 | 144 | # Output text parameters 145 | OUTPUT_VOCABULARY_SIZE = 0 # Size of the input vocabulary. Set to 0 for using all, 146 | # otherwise it will be truncated to these most frequent words. 147 | MAX_OUTPUT_TEXT_LEN = 30 # Maximum length of the output sequence 148 | # set to 0 if we want to use the whole answer as a single class 149 | MAX_OUTPUT_TEXT_LEN_TEST = 50 # Maximum length of the output sequence during test time 150 | MIN_OCCURRENCES_VOCAB = 0 # Minimum number of occurrences allowed for the words in the vocabulay. 151 | 152 | # Optimizer parameters (see model.compile() function) 153 | LOSS = 'categorical_crossentropy' 154 | CLASSIFIER_ACTIVATION = 'softmax' 155 | 156 | OPTIMIZER = 'Adadelta' # Optimizer 157 | LR = 1. # Learning rate. Recommended values - Adam 0.001 - Adadelta 1.0 158 | CLIP_C = 10. # During training, clip gradients to this norm 159 | if not '-vidtext-embed' in DATASET_NAME: 160 | SAMPLE_WEIGHTS = True # Select whether we use a weights matrix (mask) for the data outputs 161 | LR_DECAY = None # Minimum number of epochs before the next LR decay. Set to None if don't want to decay the learning rate 162 | LR_GAMMA = 0.995 # Multiplier used for decreasing the LR 163 | 164 | # Training parameters 165 | MAX_EPOCH = 200 # Stop when computed this number of epochs 166 | BATCH_SIZE = 64 # ABiViRNet trained with BATCH_SIZE = 64 167 | 168 | HOMOGENEOUS_BATCHES = False # Use batches with homogeneous output lengths for every minibatch (Possibly buggy!) 169 | PARALLEL_LOADERS = 8 # Parallel data batch loaders 170 | EPOCHS_FOR_SAVE = 1 if EVAL_EACH_EPOCHS else None # Number of epochs between model saves (None for disabling epoch save) 171 | WRITE_VALID_SAMPLES = True # Write valid samples in file 172 | SAVE_EACH_EVALUATION = True if not EVAL_EACH_EPOCHS else False # Save each time we evaluate the model 173 | 174 | # Early stop parameters 175 | EARLY_STOP = True # Turns on/off the early stop protocol 176 | PATIENCE = 20 # We'll stop if the val STOP_METRIC does not improve after this 177 | # number of evaluations 178 | 179 | if not '-vidtext-embed' in DATASET_NAME: 180 | STOP_METRIC = 'Bleu_4' # Metric for the stop 181 | else: 182 | STOP_METRIC = 'accuracy' 183 | 184 | # Model parameters 185 | MODEL_TYPE = 'TemporallyLinkedVideoDescriptionAttDoublePrev' # 'ArcticVideoCaptionWithInit' 186 | # 'ArcticVideoCaptionNoLSTMEncWithInit' 187 | # 'TemporallyLinkedVideoDescriptionNoAtt' 188 | # 'TemporallyLinkedVideoDescriptionAtt' 189 | # 'TemporallyLinkedVideoDescriptionAttDoublePrev' 190 | # 'VideoTextEmbedding' 191 | # 'DeepSeek' 192 | 193 | RNN_TYPE = 'LSTM' # RNN unit type ('LSTM' supported) 194 | 195 | # Input text parameters 196 | TARGET_TEXT_EMBEDDING_SIZE = 301 # Source language word embedding size (ABiViRNet 301) 197 | TRG_PRETRAINED_VECTORS = None # Path to pretrained vectors. (e.g. DATA_ROOT_PATH + '/DATA/word2vec.%s.npy' % TRG_LAN) 198 | # Set to None if you don't want to use pretrained vectors. 
199 | # When using pretrained word embeddings, the size of the pretrained word embeddings must match with the word embeddings size. 200 | TRG_PRETRAINED_VECTORS_TRAINABLE = True # Finetune or not the target word embedding vectors. 201 | 202 | # Encoder configuration 203 | ENCODER_HIDDEN_SIZE = 717 # For models with RNN encoder (ABiViRNet 717) 204 | BIDIRECTIONAL_ENCODER = True # Use bidirectional encoder 205 | N_LAYERS_ENCODER = 1 # Stack this number of encoding layers (default 1) 206 | BIDIRECTIONAL_DEEP_ENCODER = True # Use bidirectional encoder in all encoding layers 207 | 208 | 209 | # Previous sentence encoder 210 | PREV_SENT_ENCODER_HIDDEN_SIZE = 717 # For models with previous sentence RNN encoder (484) 211 | BIDIRECTIONAL_PREV_SENT_ENCODER = True # Use bidirectional encoder 212 | N_LAYERS_PREV_SENT_ENCODER = 1 # Stack this number of encoding layers 213 | BIDIRECTIONAL_DEEP_PREV_SENT_ENCODER = True # Use bidirectional encoder in all encoding layers 214 | 215 | DECODER_HIDDEN_SIZE = 484 # For models with LSTM decoder (ABiViRNet 484) 216 | SKIP_VECTORS_HIDDEN_SIZE = TARGET_TEXT_EMBEDDING_SIZE 217 | ADDITIONAL_OUTPUT_MERGE_MODE = 'sum' # Merge mode for the skip connections 218 | WEIGHTED_MERGE = False # Wether we want to apply a conventional or a weighted merge 219 | 220 | 221 | AFFINE_LAYERS_DIM = 500 # Dimensionality of the affine layers in 'DeepSeek' model 222 | 223 | IMG_EMBEDDING_LAYERS = [] # FC layers for visual embedding 224 | # Here we should specify the activation function and the output dimension 225 | # (e.g IMG_EMBEDDING_LAYERS = [('linear', 1024)] 226 | 227 | # Fully-Connected layers for initializing the first RNN state 228 | # Here we should only specify the activation function of each layer 229 | # (as they have a potentially fixed size) 230 | # (e.g INIT_LAYERS = ['tanh', 'relu']) 231 | INIT_LAYERS = ['tanh'] 232 | 233 | # Additional Fully-Connected layers's sizes applied before softmax. 234 | # Here we should specify the activation function and the output dimension 235 | # (e.g DEEP_OUTPUT_LAYERS = [('tanh', 600), ('relu', 400), ('relu', 200)]) 236 | DEEP_OUTPUT_LAYERS = [] 237 | 238 | # Regularizers 239 | WEIGHT_DECAY = 1e-4 # L2 regularization 240 | RECURRENT_WEIGHT_DECAY = 0. # L2 regularization in recurrent layers 241 | 242 | USE_DROPOUT = True # Use dropout 243 | DROPOUT_P = 0.5 # Percentage of units to drop 244 | 245 | USE_RECURRENT_DROPOUT = False # Use dropout in recurrent layers # DANGEROUS! 
246 | RECURRENT_DROPOUT_P = 0.5 # Percentage of units to drop in recurrent layers 247 | 248 | USE_NOISE = True # Use gaussian noise during training 249 | NOISE_AMOUNT = 0.01 # Amount of noise 250 | 251 | USE_BATCH_NORMALIZATION = True # If True it is recommended to deactivate Dropout 252 | BATCH_NORMALIZATION_MODE = 1 # See documentation in Keras' BN 253 | 254 | USE_PRELU = False # use PReLU activations as regularizer 255 | USE_L2 = False # L2 normalization on the features 256 | 257 | # Results plot and models storing parameters 258 | EXTRA_NAME = '' # This will be appended to the end of the model name 259 | MODEL_NAME = DATASET_NAME + '_' + MODEL_TYPE +\ 260 | '_txtemb_' + str(TARGET_TEXT_EMBEDDING_SIZE) + \ 261 | '_imgemb_' + '_'.join([layer[0] for layer in IMG_EMBEDDING_LAYERS]) + \ 262 | '_lstmenc_' + str(ENCODER_HIDDEN_SIZE) + \ 263 | '_lstm_' + str(DECODER_HIDDEN_SIZE) + \ 264 | '_additional_output_mode_' + str(ADDITIONAL_OUTPUT_MERGE_MODE) + \ 265 | '_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \ 266 | '_' + OPTIMIZER + '_decay_' + str(LR_DECAY) + '-' + str(LR_GAMMA) 267 | 268 | MODEL_NAME += '_' + EXTRA_NAME 269 | 270 | # Name and location of the pre-trained model (only if RELOAD > 0) 271 | PRE_TRAINED_MODELS = ['MSVD_best_model'] 272 | # default: MODEL_NAME 273 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification_BLSTM_text'] 274 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification'] 275 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adam_decay_1-0.95vidtext_embed'] 276 | # ['MSVD_best_model'] 277 | # ['MSVD_best_model', '1BillionWords'] 278 | PRE_TRAINED_MODEL_STORE_PATHS = map(lambda x: 'trained_models/' + x + '/', PRE_TRAINED_MODELS) if isinstance(PRE_TRAINED_MODELS, list) else 'trained_models/'+PRE_TRAINED_MODELS+'/' 279 | LOAD_WEIGHTS_ONLY = True # Load weights of pre-trained model or complete Model_Wrapper instance 280 | # Layers' mapping from old to new model if LOAD_WEIGHTS_ONLY 281 | # You can check the layers of a model with [layer.name for layer in model_wrapper.model.layers] 282 | if '-video' in DATASET_NAME: 283 | # Pre-train MSVD 284 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 285 | 'initial_state': 'initial_state', 286 | 'initial_memory': 'initial_memory', 287 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond', 288 | 'target_word_embedding': 'target_word_embedding', 289 | 'logit_ctx': 'logit_ctx', 290 | 'logit_lstm': 'logit_lstm', 291 | 'description': 'description' 292 | } 293 | ] 294 | # Pre-train vidtext embedding 295 | """ 296 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM', 297 | 'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', 298 | 'target_word_embedding': 'target_word_embedding', 299 | 'logit_ctx': 'logit_ctx', 300 | 'logit_prev': 'logit_prev', 301 | } 302 | ] 303 | """ 304 | 305 | elif '-vidtext-embed' in DATASET_NAME: 306 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 307 | 'target_word_embedding': 'target_word_embedding', 308 | 'logit_ctx': 'logit_ctx', 309 | } 310 | ] 311 | else: 312 | if MODEL_TYPE == 'ArcticVideoCaptionWithInit': 313 | 
LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 314 | 'initial_state': 'initial_state', 315 | 'initial_memory': 'initial_memory', 316 | 'attlstmcond_1': 'decoder_AttLSTMCond', 317 | 'target_word_embedding': 'target_word_embedding', 318 | 'logit_ctx': 'logit_ctx', 319 | 'logit_lstm': 'logit_lstm', 320 | 'description': 'description' 321 | } 322 | ] 323 | 324 | elif MODEL_TYPE == 'TemporallyLinkedVideoDescriptionAttDoublePrev': 325 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 326 | 'initial_state': 'initial_state', 327 | 'initial_memory': 'initial_memory', 328 | 'attlstmcond_1': 'decoder_AttLSTMCond3Inputs', # 'decoder_AttLSTMCond', 329 | 'target_word_embedding': 'target_word_embedding', 330 | 'logit_ctx': 'logit_ctx', 331 | 'logit_lstm': 'logit_lstm', 332 | 'description': 'description' 333 | } 334 | ] 335 | 336 | elif len(PRE_TRAINED_MODELS) == 2: 337 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 338 | 'initial_state': 'initial_state', 339 | 'initial_memory': 'initial_memory', 340 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond', 341 | #'target_word_embedding': 'target_word_embedding', 342 | 'logit_ctx': 'logit_ctx', 343 | 'logit_lstm': 'logit_lstm', 344 | #'description': 'description' 345 | }, 346 | {'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', #'prev_desc_emb_encoder_LSTM', 347 | 'target_word_embedding': 'target_word_embedding', 348 | 'decoder_AttLSTMCond': 'decoder_AttLSTMCond2Inputs', #'decoder_AttLSTMCond', 349 | 'target_text': 'description' 350 | } 351 | ] 352 | 353 | elif len(PRE_TRAINED_MODELS) == 1: # reuse data from vidtext-embedding model 354 | 355 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM', 356 | 'prev_desc_emb_bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', 357 | 'target_word_embedding': 'target_word_embedding', 358 | 'logit_ctx': 'logit_ctx', 359 | 'logit_prev': 'logit_prev', 360 | } 361 | ] 362 | 363 | 364 | STORE_PATH = 'trained_models/' + MODEL_NAME + '/' # Models and evaluation results will be stored here 365 | DATASET_STORE_PATH = 'datasets/' # Dataset instance will be stored here 366 | 367 | SAMPLING_SAVE_MODE = 'list' # 'list' or 'vqa' 368 | VERBOSE = 1 # Vqerbosity level 369 | RELOAD = 0 # If 0 start training from scratch, otherwise the model 370 | # Saved on epoch 'RELOAD' will be used 371 | REBUILD_DATASET = True # Build again or use stored instance 372 | MODE = 'training' # 'training' or 'sampling' (if 'sampling' then RELOAD must 373 | # be greater than 0 and EVAL_ON_SETS will be used) 374 | RELOAD_PATH = None 375 | SAMPLING_RELOAD_EPOCH = False 376 | SAMPLING_RELOAD_POINT = 0 377 | # Extra parameters for special trainings 378 | TRAIN_ON_TRAINVAL = False # train the model on both training and validation sets combined 379 | FORCE_RELOAD_VOCABULARY = False # force building a new vocabulary from the training samples applicable if RELOAD > 1 380 | 381 | # ============================================ 382 | parameters = locals().copy() 383 | return parameters 384 | -------------------------------------------------------------------------------- /data_engine/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing of EDUB-SegDesc dataset 2 | 3 | The scripts stored in this folder 'data_engine' are intended to preprocess the data from the [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) dataset in order to use them as 
an input for building a Dataset object instance (see [staged_keras_wrapper](https://github.com/MarcBS/staged_keras_wrapper)). 4 | 5 | Two different kinds of inputs can be used for training the video description models: 6 | 7 | 1) Raw video frames (see section 'Image lists generation') 8 | 2) Features from video frames (see section 'Image features generation') 9 | 10 | Additionally, we can train a model on temporally-linked samples; in that case, we have to run an additional pre-processing script. 11 | 12 | ## Folder structure 13 | 14 | In the following, we describe the desired folder structure for storing the dataset-related information: 15 | 16 | ./Images 17 | video_[video_id] 18 | [num_image].jpg 19 | [num_image].jpg 20 | ./Annotations 21 | test_list.txt 22 | train_list.txt 23 | val_list.txt 24 | captions.id.en 25 | ./Features 26 | test_[name_feat].csv 27 | train_[name_feat].csv 28 | val_[name_feat].csv 29 | 30 | The folder ./Images contains a set of folders 'video_[video_id]', where each folder represents a video and contains a set of frames '[num_image].jpg'. 31 | 32 | The folder ./Annotations contains, for each set split {train, val, test}, a file with the suffix _list.txt, containing the list of videos 'video_[video_id]' belonging to the respective split. It also contains the file 'captions.id.en', which lists all the available captions for all the videos. 33 | 34 | The folder ./Features contains any kind of features extracted from the respective set splits (only needed if using image features instead of raw images). 35 | 36 | 37 | ## Descriptions generation 38 | 39 | This step will be needed whether we are using raw video frames or video features. 40 | 41 | Script name: 42 | generate_descriptions_lists.py 43 | Description: 44 | Extracts and counts the available descriptions for each video. 45 | Output: 46 | - A file per split with the suffix _descriptions.txt. 47 | Containing a list of descriptions for all videos. 48 | - A file per split with the suffix _descriptions_counts.npy. 49 | Containing a python list with the counts of descriptions per video. 50 | The output will be stored in ./Annotations. 51 | 52 | 53 | ## Image lists generation 54 | 55 | This step will be needed if we are using raw video frames only. 56 | 57 | Script name: 58 | generate_img_lists.py 59 | Description: 60 | Lists and counts the frames belonging to each video. 61 | Output: 62 | - A file per split with the suffix _imgs_list.txt. 63 | Containing the list of frames for all videos. 64 | - A file per split with the suffix _imgs_counts.txt. 65 | Containing a list of frame counts per video. 66 | The output will be stored in ./Annotations. 67 | 68 | 69 | ## Image features generation 70 | 71 | This step will be needed if we are using image features only. The number of feature vectors per video does not need to match the number of frames. 72 | 73 | Script name: 74 | generate_features_lists.py 75 | Description: 76 | Stores each feature vector contained in the corresponding ./Features/[split_name]_[name_feat].csv in a separate .npy file and counts them. 77 | Output: 78 | - A file per split with the suffix _feat_list.txt. 79 | Containing the path to each feature vector. 80 | - A file per split with the suffix _feat_counts.txt. 81 | Containing the counts of vectors per video. 82 | The output .txt files will be stored in ./Annotations/[name_feat]/ and the .npy files in ./Features/[name_feat]/. 83 | 84 | ## Temporally-linked samples 85 | 86 | This step will be needed if we are using temporally-linked samples.
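The link files produced by this step (described in detail below) are plain text files with one integer per sample: the index of the previous sample in the temporal link, or -1 if the sample is the first of its link. As a quick illustration, the following sketch summarizes the temporal chains encoded in such a file; the dataset path is a placeholder, the file name follows the default convention used in config.py, and the samples are assumed to be stored in temporal order:

```python
# Sketch: summarize the temporal chains encoded in a link file.
# base_path is a placeholder; adapt it to your dataset root.
base_path = '/path/to/EDUB-SegDesc/'
link_file = base_path + 'Annotations/train_link_samples.txt'

links = [int(line.strip()) for line in open(link_file)]

chain_lengths = []
for prev_idx in links:
    if prev_idx == -1 or not chain_lengths:
        # -1 marks the first sample of a new temporally-linked sequence
        chain_lengths.append(1)
    else:
        chain_lengths[-1] += 1

print('Samples:       ' + str(len(links)))
print('Linked chains: ' + str(len(chain_lengths)))
print('Longest chain: ' + str(max(chain_lengths)))
```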
87 | 88 | Script name: 89 | generate_link_lists.py 90 | Description: 91 | Stores a separate list .Annotations/[split_list]_link_samples.txt with the indices to the previous samples in the temporal link. 92 | Output: 93 | - A file per split with the suffix _link_samples.txt. 94 | Containing the index to the previous sample in the link (or -1) if it is the first sample in the link. 95 | -------------------------------------------------------------------------------- /data_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/data_engine/__init__.py -------------------------------------------------------------------------------- /data_engine/generate_corpus_full_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | the file id_seg_cap.txt has been generated with the folloing script 3 | 4 | awk '{print substr(FILENAME, 1, length(FILENAME)-4) "," $0}' * > ../id_seg_cap.txt 5 | 6 | and its format is: 7 | file_id, segment_number, caption 8 | """ 9 | 10 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/' 11 | 12 | txt_files = base_path + 'id_seg_cap.txt' 13 | dest_files = base_path + 'captions.id.full_history.txt' 14 | 15 | file = open(txt_files, mode='r') 16 | dest_file = open(dest_files + 'curr', mode='w') 17 | 18 | separator = '----' 19 | space_sym = ' ' 20 | 21 | prev_id = 'Segment1' 22 | caps_txt = [] 23 | prev_caps = [] 24 | j = 0 25 | for line in file: 26 | id_text = line.split(",") 27 | user_id = id_text[0] 28 | segment_id = id_text[1] 29 | text = ' '.join(id_text[2:]).strip() 30 | j += 1 31 | if j % 1000 == 0: 32 | print "Processed", j, "lines" 33 | if segment_id == prev_id: 34 | caps_txt.append(text) 35 | 36 | # for prev_cap in prev_caps: 37 | # caps_txt.append(prev_cap + space_sym + text) 38 | elif segment_id == 'Segment1': # Start of day 39 | prev_id = segment_id 40 | i = 0 41 | for curr_cap in caps_txt: 42 | dest_file.write(user_id + '_' + segment_id + '#' + str(i) + separator + curr_cap + '\n') 43 | i += 1 44 | prev_caps = caps_txt 45 | else: 46 | # Different segment 47 | # We combine 48 | prev_id = segment_id 49 | # for prev_cap in prev_caps: 50 | # prev_caps2.append(prev_cap + space_sym + cap) 51 | caps_txt = [] 52 | caps_txt.append(text) 53 | i = 0 54 | for prev_cap in prev_caps: 55 | for curr_cap in caps_txt: 56 | dest_file.write( 57 | user_id + '_' + segment_id + '#' + str(i) + separator + prev_cap + space_sym + curr_cap + '\n') 58 | i += 1 59 | prev_caps = [prev_cap + space_sym + curr_cap for curr_cap in caps_txt for prev_cap in prev_caps] 60 | -------------------------------------------------------------------------------- /data_engine/generate_descriptions_lists.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def main(): 5 | # base_path = '/media/HDD_2TB/DATASETS/MSVD/' 6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | 8 | without_noninfo = True 9 | 10 | path_files = 'Annotations' 11 | 12 | # Inputs 13 | # text = 'captions.id.en' 14 | if without_noninfo: 15 | text = 'captions_final_without_noninfo.id.en' 16 | else: 17 | text = 'captions_final.id.en' 18 | separator = '----' 19 | 20 | # train = 'train_list.txt' 21 | # val = 'val_list.txt' 22 | # test = 'test_list.txt' 23 | 24 | if without_noninfo: 25 | train = 'train_list_final_without_noninfo.txt' 26 | val = 'val_list_final_without_noninfo.txt' 27 | test = 
'test_list_final_without_noninfo.txt' 28 | 29 | # Outputs 30 | train_out = 'train_descriptions_without_noninfo.txt' 31 | val_out = 'val_descriptions_without_noninfo.txt' 32 | test_out = 'test_descriptions_without_noninfo.txt' 33 | 34 | train_out_counts = 'train_descriptions_counts_without_noninfo.npy' 35 | val_out_counts = 'val_descriptions_counts_without_noninfo.npy' 36 | test_out_counts = 'test_descriptions_counts_without_noninfo.npy' 37 | 38 | else: 39 | train = 'train_list_final.txt' 40 | val = 'val_list_final.txt' 41 | test = 'test_list_final.txt' 42 | 43 | # Outputs 44 | train_out = 'train_descriptions.txt' 45 | val_out = 'val_descriptions.txt' 46 | test_out = 'test_descriptions.txt' 47 | 48 | train_out_counts = 'train_descriptions_counts.npy' 49 | val_out_counts = 'val_descriptions_counts.npy' 50 | test_out_counts = 'test_descriptions_counts.npy' 51 | 52 | ################################# 53 | 54 | # Code 55 | 56 | text = path_files + '/' + text 57 | splits = [path_files + '/' + train, path_files + '/' + val, path_files + '/' + test] 58 | splits_out = [path_files + '/' + train_out, path_files + '/' + val_out, path_files + '/' + test_out] 59 | splits_counts = [path_files + '/' + train_out_counts, path_files + '/' + val_out_counts, 60 | path_files + '/' + test_out_counts] 61 | 62 | # read video names 63 | img_splits = [[], [], []] 64 | for i, s in enumerate(splits): 65 | with open(base_path + s, 'r') as f: 66 | for line in f: 67 | line = line.rstrip('\n') 68 | img_splits[i].append(line) 69 | 70 | # print img_splits 71 | 72 | 73 | # read descriptions and assign them to a split 74 | desc_splits = [] 75 | counts_splits = [] 76 | for i_s, s in enumerate(splits): 77 | desc_splits.append([[] for i in range(len(img_splits[i_s]))]) 78 | counts_splits.append([0 for i in range(len(img_splits[i_s]))]) 79 | with open(base_path + text, 'r') as f: 80 | for line in f: 81 | line = line.rstrip('\n') 82 | line = line.split('#') 83 | img = line[0] 84 | line = line[1].split(separator) 85 | desc = line[1] 86 | 87 | found = False 88 | i = 0 89 | while (not found and i < len(splits)): 90 | if (img in img_splits[i]): 91 | found = True 92 | idx = img_splits[i].index(img) 93 | desc_splits[i][idx].append(desc) 94 | counts_splits[i][idx] += 1 95 | i += 1 96 | 97 | if (not found): 98 | print 'Warning: Video ' + img + ' does not exist in lists' 99 | 100 | # write descriptions in separate files 101 | for f, d in zip(splits_out, desc_splits): 102 | f = open(base_path + f, 'w') 103 | for im in d: 104 | for desc in im: 105 | f.write(desc + '\n') 106 | f.close() 107 | 108 | # store description counts for each video 109 | for c, s in zip(counts_splits, splits_counts): 110 | np.save(base_path + s, c) 111 | 112 | print 'Done' 113 | 114 | 115 | main() 116 | -------------------------------------------------------------------------------- /data_engine/generate_features_lists.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import numpy as np 5 | 6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | path_features = 'Features' 8 | path_annotations = 'Annotations' 9 | without_noninfo = True 10 | 11 | # Inputs 12 | if without_noninfo: 13 | features_name = 'ImageNet_Without_NonInfo' 14 | else: 15 | features_name = 'ImageNet' 16 | 17 | ###### Files with fixed number of frames per video 18 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv'] 19 | # features_counts = ['train_' + 
features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt'] 20 | 21 | ###### Files all original frames of videos 22 | # features_files = ['train_' + features_name + '.csv', 23 | # 'val_' + features_name + '.csv', 24 | # 'test_' + features_name + '.csv'] 25 | # features_counts = ['train_' + features_name + '_counts.txt', 26 | # 'val_' + features_name + '_counts.txt', 27 | # 'test_' + features_name + '_all_frames_counts.txt'] 28 | 29 | 30 | if without_noninfo: 31 | features_files = ['train_' + features_name + '_all_frames_without_noninfo.csv', 32 | 'val_' + features_name + '_all_frames_without_noninfo.csv', 33 | 'test_' + features_name + '_all_frames_without_noninfo.csv'] 34 | features_counts = ['train_' + features_name + '_all_frames_counts_without_noninfo.txt', 35 | 'val_' + features_name + '_all_frames_counts_without_noninfo.txt', 36 | 'test_' + features_name + '_all_frames_counts_without_noninfo.txt'] 37 | else: 38 | features_files = ['train_' + features_name + '_all_frames.csv', 39 | 'val_' + features_name + '_all_frames.csv', 40 | 'test_' + features_name + '_all_frames.csv'] 41 | features_counts = ['train_' + features_name + '_all_frames_counts.txt', 42 | 'val_' + features_name + '_all_frames_counts.txt', 43 | 'test_' + features_name + '_all_frames_counts.txt'] 44 | 45 | # features_name = 'C3D_fc8_ImageNet' 46 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv'] 47 | # features_counts = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt'] 48 | 49 | # Outputs 50 | if without_noninfo: 51 | out_lists = ['train_feat_list_without_noninfo.txt', 52 | 'val_feat_list_without_noninfo.txt', 53 | 'test_feat_list_without_noninfo.txt'] 54 | counts_lists = ['train_feat_counts_without_noninfo.txt', 55 | 'val_feat_counts_without_noninfo.txt', 56 | 'test_feat_counts_without_noninfo.txt'] 57 | else: 58 | out_lists = ['train_feat_list.txt', 'val_feat_list.txt', 'test_feat_list.txt'] 59 | counts_lists = ['train_feat_counts.txt', 'val_feat_counts.txt', 'test_feat_counts.txt'] 60 | 61 | ######### 62 | 63 | if os.path.isdir(base_path + '/' + path_features + '/' + features_name): 64 | shutil.rmtree(base_path + '/' + path_features + '/' + features_name) 65 | os.makedirs(base_path + '/' + path_features + '/' + features_name) 66 | 67 | if not os.path.isdir(base_path + '/' + path_annotations + '/' + features_name): 68 | os.makedirs(base_path + '/' + path_annotations + '/' + features_name) 69 | 70 | c_videos = 0 71 | for f, fc, o, c in zip(features_files, features_counts, out_lists, counts_lists): 72 | print "Processing " + f 73 | 74 | f = open(base_path + '/' + path_features + '/' + f, 'r') 75 | fc = open(base_path + '/' + path_features + '/' + fc, 'r') 76 | o = open(base_path + '/' + path_annotations + '/' + features_name + '/' + o, 'w') 77 | c = open(base_path + '/' + path_annotations + '/' + features_name + '/' + c, 'w') 78 | 79 | all_counts = list() 80 | for line in fc: 81 | line = line.strip('\n') 82 | all_counts.append(int(line)) 83 | 84 | c_frame = 0 85 | c_videos_split = 0 86 | # Process each line in the file 87 | for enum, line in enumerate(f): 88 | frame = line.strip('\n') 89 | frame = np.fromstring(frame, sep=',') # covert csv line to numpy array 90 | 91 | this_path = "%s/video_%0.4d" % (path_features + '/' + features_name, c_videos) 92 | if not os.path.isdir(base_path + this_path): 93 | os.makedirs(base_path + 
this_path) 94 | this_path = "%s/video_%0.4d/frame_%0.4d.npy" % (path_features + '/' + features_name, c_videos, c_frame) 95 | # Save array in disk 96 | try: 97 | np.save(base_path + this_path, frame) 98 | except: 99 | print 'line file', enum 100 | print 'file name', base_path + this_path 101 | print 'lenvec', len(frame) 102 | print 'vec', frame 103 | print 104 | # Write path to file 105 | o.write(this_path + '\n') 106 | 107 | c_frame += 1 108 | 109 | # a complete video was processed 110 | if c_frame % all_counts[c_videos_split] == 0: 111 | c_videos += 1 112 | c.write(str(all_counts[c_videos_split]) + '\n') # store counts 113 | c_videos_split += 1 114 | c_frame = 0 115 | 116 | f.close() 117 | fc.close() 118 | o.close() 119 | c.close() 120 | 121 | print 'Done!' 122 | -------------------------------------------------------------------------------- /data_engine/generate_img_lists.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/' 4 | 5 | # Inputs 6 | split_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt'] 7 | imgs_format = '.jpg' 8 | path_imgs = 'Images' 9 | path_files = 'Annotations' 10 | 11 | # Outputs 12 | out_lists = ['train_imgs_list.txt', 'val_imgs_list.txt', 'test_imgs_list.txt'] 13 | counts_lists = ['train_imgs_counts.txt', 'val_imgs_counts.txt', 'test_imgs_counts.txt'] 14 | 15 | # Code 16 | print 'Listing all images from all videos...' 17 | 18 | len_base = len(base_path) 19 | for s, o, c in zip(split_lists, out_lists, counts_lists): 20 | s = open(base_path + '/' + path_files + '/' + s, 'r') 21 | o = open(base_path + '/' + path_files + '/' + o, 'w') 22 | c = open(base_path + '/' + path_files + '/' + c, 'w') 23 | for line in s: 24 | video = line.strip('\n') 25 | this_path = base_path + '/' + path_imgs + "/video_" + video + "/*" + imgs_format 26 | images = glob.glob(this_path) 27 | for im in images: 28 | # o.write(path_imgs+"/video_"+video+"/"+im+'\n') # store each image path 29 | o.write(im[len_base:] + '\n') 30 | c.write(str(len(images)) + '\n') # store counts 31 | s.close() 32 | o.close() 33 | c.close() 34 | 35 | print 'Done!' 
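# --- Optional sanity check (an added sketch, not part of the original script) ---
# Each split should end up with one path per image in its *_imgs_list.txt and one count
# per video in its *_imgs_counts.txt, so the counts should sum to the number of listed
# paths. Disabled by default so the behaviour of the script above is unchanged.
RUN_SANITY_CHECK = False
if RUN_SANITY_CHECK:
    for o_name, c_name in zip(out_lists, counts_lists):
        with open(base_path + '/' + path_files + '/' + o_name) as fo:
            n_paths = sum(1 for _ in fo)
        with open(base_path + '/' + path_files + '/' + c_name) as fc:
            n_expected = sum(int(l) for l in fc)
        print o_name, ':', n_paths, 'paths listed,', n_expected, 'expected from counts'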
36 | -------------------------------------------------------------------------------- /data_engine/generate_img_lists_from_split.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import xlrd 5 | 6 | # Split the existent data in train, val and test 7 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc' 8 | 9 | # input data paths 10 | in_descriptions_path = 'GT/descriptions' 11 | in_segments_path = 'GT/segmentations' 12 | in_images_path = 'Images' # //.jpg 13 | imgs_format = '.jpg' 14 | 15 | # output data paths 16 | out_features_path = 'Features' # __all_frames.csv & __all_frames_counts.txt 17 | out_descriptions_path = 'Annotations' 18 | out_image_lists_path = 'Annotations' # _imgs_list.txt & _imgs_counts.txt 19 | 20 | # Get day_sets for each data split 21 | sets = dict() 22 | for s in ['train', 'val', 'test']: 23 | sets[s] = [] 24 | with open(data_path + '/' + out_descriptions_path + '/' + s + '_list_final.txt', 'r') as list_file: 25 | prev_set = -1 26 | for line in list_file: 27 | line = line.rstrip('\n') 28 | line = line.split('_') 29 | if line[0] != prev_set: 30 | sets[s].append(line[0]) 31 | prev_set = line[0] 32 | 33 | # Get segments' IDs with errors 34 | errors = dict() 35 | for s in ['train', 'val', 'test']: 36 | errors[s] = dict() 37 | for day_split in sets[s]: 38 | errors[s][day_split] = [] 39 | with open(data_path + '/' + in_descriptions_path + '/' + day_split + '.txt', 'r') as list_file: 40 | for line in list_file: 41 | line = line.rstrip('\n').split(',') 42 | segm_id = int(line[0][7:]) 43 | desc = ','.join(line[1:]) 44 | desc = desc.strip().lower() 45 | if desc == 'error': 46 | errors[s][day_split].append(segm_id) 47 | 48 | # Get events of correct segments 49 | for s in ['train', 'val', 'test']: 50 | 51 | file_imgs = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_list.txt', 'w') 52 | file_counts = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_counts.txt', 'w') 53 | 54 | for day_split in sets[s]: 55 | possible_names = ['/GT_' + day_split + '.xls', '/GT_' + day_split + '.xlsx', '/' + day_split + '.xls', 56 | '/' + day_split + '.xlsx'] 57 | exists = False 58 | i = 0 59 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]): 60 | i += 1 61 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i]) 62 | sheet = file.sheet_by_index(0) 63 | 64 | count_segments = 1 65 | these_events = [] 66 | empty = False 67 | i = 2 # 1st row with info 68 | while not empty: 69 | try: 70 | evt = sheet.cell(i, 1).value.split() 71 | if len(evt) == 1: 72 | evt = sheet.cell(i, 1).value.split('-') 73 | if evt: 74 | if count_segments not in errors[s][day_split]: # avoid segments with errors (dark/blurry images) 75 | these_events.append([evt[0].strip(), evt[1].strip()]) 76 | else: 77 | empty = True 78 | i += 1 79 | count_segments += 1 80 | except: 81 | empty = True 82 | 83 | # Get list of images 84 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + day_split + '/*' + imgs_format) 85 | final_these_images = [] 86 | for im in these_images: 87 | final_these_images.append(im.split('/')[-1].split('.')[0]) 88 | final_these_images = sorted(final_these_images) 89 | 90 | for e in these_events: 91 | if e[1] not in final_these_images: 92 | e[1] = '0' + e[1] 93 | if e[0] not in final_these_images: 94 | e[0] = '0' + e[0] 95 | 96 | fin_idx = final_these_images.index(e[1]) + 1 97 | ini_idx = final_these_images.index(e[0]) 98 | current_event_imgs = 
final_these_images[ini_idx:fin_idx] 99 | 100 | # Store in files 101 | this_count = 0 102 | for imid in current_event_imgs: 103 | file_imgs.write(in_images_path + '/' + day_split + '/' + imid + imgs_format + '\n') 104 | this_count += 1 105 | file_counts.write(str(this_count) + '\n') 106 | 107 | file_imgs.close() 108 | file_counts.close() 109 | 110 | print 'DONE!' 111 | -------------------------------------------------------------------------------- /data_engine/generate_link_lists.py: -------------------------------------------------------------------------------- 1 | ## Parameters 2 | 3 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 4 | 5 | path_files = 'Annotations' 6 | without_noninfo = True 7 | 8 | # Names of the different samples 9 | # All samples belonging to the same day must accomplish the following requirements: 10 | # - Be referenced continuously, without mixing with other days 11 | # - Be stored in chronological order 12 | # - Include the day identifier at the beginning of the line separated by the symbol '_' 13 | # Example: 14 | # Day1_video_1 15 | # Day1_video_2 16 | # Day1_video_3 17 | # Day2_video_1 18 | # Day2_video_2 19 | #### 20 | 21 | if without_noninfo: 22 | suffix = '_without_noninfo' 23 | else: 24 | suffix = '' 25 | 26 | train = 'train_list_final' + suffix + '.txt' 27 | val = 'val_list_final' + suffix + '.txt' 28 | test = 'test_list_final' + suffix + '.txt' 29 | 30 | # Outputs 31 | train_out = 'train_link_samples' + suffix + '.txt' 32 | val_out = 'val_link_samples' + suffix + '.txt' 33 | test_out = 'test_link_samples' + suffix + '.txt' 34 | 35 | ################################# 36 | 37 | ## Code 38 | 39 | # Generate temporal links between samples which belong to the same day 40 | for fin, fout in zip([train, val, test], [train_out, val_out, test_out]): 41 | 42 | with open(base_path + '/' + path_files + '/' + fin, 'r') as fi, open(base_path + '/' + path_files + '/' + fout, 43 | 'w') as fo: 44 | prev_day_name = '' 45 | lines_counter = -1 46 | for line in fi: 47 | day_name = line.split('_')[0] 48 | if day_name == prev_day_name: 49 | fo.write(str(lines_counter) + '\n') 50 | lines_counter += 1 51 | else: 52 | fo.write('-1\n') 53 | lines_counter += 1 54 | 55 | prev_day_name = day_name 56 | 57 | print 'Done' 58 | -------------------------------------------------------------------------------- /data_engine/generate_parallel_corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates a parallel corpus from the EDUB-GT Annotations: 3 | A language is the image captions. 4 | The other language is the previous caption of each sentence. 5 | """ 6 | 7 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/' 8 | 9 | txt_files = base_path + 'text.clean.txt' 10 | dest_files = base_path + 'training.' 
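# (Added note) Judging from the parsing loop below, each line of 'text.clean.txt' is
# expected to look like "Segment<N>,<caption text>". The script writes two line-aligned
# files, 'training.prev' and 'training.curr': line i of 'training.prev' holds a caption
# of the previous segment (or 'None' at the start of a day) and line i of
# 'training.curr' holds the corresponding caption of the current segment.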
11 | 12 | file = open(txt_files, mode='r') 13 | 14 | file_prevs = open(dest_files + 'prev', mode='w') 15 | file_curr = open(dest_files + 'curr', mode='w') 16 | 17 | prev_id = 'Segment1' 18 | caps_txt = [] 19 | prev_caps = ['None'] 20 | i = 0 21 | for line in file: 22 | id_text = line.split(",") 23 | id = id_text[0] 24 | text = ' '.join(id_text[1:]).strip() 25 | if id == prev_id: 26 | caps_txt.append(text) 27 | elif id == 'Segment1': 28 | prev_id = id 29 | prev_caps = ['None'] 30 | caps_txt.append(text) 31 | for curr_cap in caps_txt: 32 | for prev_cap in prev_caps: 33 | file_prevs.write(prev_cap + '\n') 34 | file_curr.write(curr_cap + '\n') 35 | i += 1 36 | else: 37 | caps_txt.append(text) 38 | for curr_cap in caps_txt: 39 | for prev_cap in prev_caps: 40 | file_prevs.write(prev_cap + '\n') 41 | file_curr.write(curr_cap + '\n') 42 | i += 1 43 | 44 | prev_id = id 45 | prev_caps = caps_txt 46 | caps_txt = [] 47 | -------------------------------------------------------------------------------- /data_engine/prepare_data.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | 6 | from keras_wrapper.dataset import Dataset, saveDataset, loadDataset 7 | from keras_wrapper.extra.read_write import pkl2dict 8 | 9 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 10 | 11 | 12 | def build_dataset(params): 13 | if params['REBUILD_DATASET']: # We build a new dataset instance 14 | if params['VERBOSE'] > 0: 15 | silence = False 16 | logging.info('Building ' + params['DATASET_NAME'] + ' dataset') 17 | else: 18 | silence = True 19 | 20 | base_path = params['DATA_ROOT_PATH'] 21 | name = params['DATASET_NAME'] 22 | ds = Dataset(name, base_path, silence=silence) 23 | 24 | if not '-vidtext-embed' in params['DATASET_NAME']: 25 | # OUTPUT DATA 26 | # Let's load the train, val and test splits of the descriptions (outputs) 27 | # the files include a description per line. In this dataset a variable number 28 | # of descriptions per video are provided. 
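            # (Added note) each DESCRIPTION_FILES entry is expected to be a plain-text file
            # with one description per line, as produced by data_engine/generate_descriptions_lists.py.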
29 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 30 | 'train', 31 | type='text', 32 | id=params['OUTPUTS_IDS_DATASET'][0], 33 | build_vocabulary=True, 34 | tokenization=params['TOKENIZATION_METHOD'], 35 | fill=params['FILL'], 36 | pad_on_batch=True, 37 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 38 | sample_weights=params['SAMPLE_WEIGHTS'], 39 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 40 | 41 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['val'], 42 | 'val', 43 | type='text', 44 | id=params['OUTPUTS_IDS_DATASET'][0], 45 | build_vocabulary=True, 46 | pad_on_batch=True, 47 | tokenization=params['TOKENIZATION_METHOD'], 48 | sample_weights=params['SAMPLE_WEIGHTS'], 49 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 50 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 51 | 52 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['test'], 53 | 'test', 54 | type='text', 55 | id=params['OUTPUTS_IDS_DATASET'][0], 56 | build_vocabulary=True, 57 | pad_on_batch=True, 58 | tokenization=params['TOKENIZATION_METHOD'], 59 | sample_weights=params['SAMPLE_WEIGHTS'], 60 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 61 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 62 | 63 | else: 64 | # Use descriptions as inputs instead --> 'matching'/'non-matching' as output 65 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 66 | 'train', 67 | type='text', 68 | id=params['INPUTS_IDS_DATASET'][1], 69 | build_vocabulary=True, 70 | tokenization=params['TOKENIZATION_METHOD'], 71 | fill=params['FILL'], 72 | pad_on_batch=True, 73 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 74 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 75 | 76 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['val'], 77 | 'val', 78 | type='text', 79 | id=params['INPUTS_IDS_DATASET'][1], 80 | build_vocabulary=True, 81 | pad_on_batch=True, 82 | tokenization=params['TOKENIZATION_METHOD'], 83 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 84 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 85 | 86 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['test'], 87 | 'test', 88 | type='text', 89 | id=params['INPUTS_IDS_DATASET'][1], 90 | build_vocabulary=True, 91 | pad_on_batch=True, 92 | tokenization=params['TOKENIZATION_METHOD'], 93 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 94 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 95 | 96 | # INPUT DATA 97 | # Let's load the associated videos (inputs) 98 | # we must take into account that in this dataset we have a different number of sentences per video, 99 | # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list 100 | # containing the number of captions in each video. 
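        # (Added illustrative note) e.g. if a split contains two videos with 3 and 5
        # captions respectively, the counts file holds [3, 5] and 'repeat_set' makes the
        # first video's features be served 3 times and the second one's 5 times, so that
        # every caption gets its own (video, caption) training pair.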
101 | 102 | num_captions_train = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train']) 103 | num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val']) 104 | num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test']) 105 | 106 | for feat_type in params['FEATURE_NAMES']: 107 | for split, num_cap in zip(['train', 'val', 'test'], 108 | [num_captions_train, num_captions_val, num_captions_test]): 109 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][split] % feat_type 110 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][split] % feat_type 111 | 112 | ds.setInput([list_files, counts_files], 113 | split, 114 | type=params['INPUT_DATA_TYPE'], 115 | id=params['INPUTS_IDS_DATASET'][0], 116 | repeat_set=num_cap, 117 | max_video_len=params['NUM_FRAMES'], 118 | feat_len=params['IMG_FEAT_SIZE'], 119 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 120 | 121 | if not '-vidtext-embed' in params['DATASET_NAME'] and len(params['INPUTS_IDS_DATASET']) > 1: 122 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 123 | 'train', 124 | type='text', 125 | id=params['INPUTS_IDS_DATASET'][1], 126 | required=False, 127 | tokenization=params['TOKENIZATION_METHOD'], 128 | pad_on_batch=True, 129 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 130 | offset=1, 131 | fill=params['FILL'], 132 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 133 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 134 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 135 | 136 | ds.setInput(None, 'val', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False) 137 | ds.setInput(None, 'test', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False) 138 | 139 | # Set inputs for temporally-linked samples 140 | if not '-vidtext-embed' in params['DATASET_NAME'] and '-linked' in params['DATASET_NAME']: 141 | # Set input captions from previous event/video 142 | if '-upperbound' not in params['DATASET_NAME']: 143 | if '-vidtext' in params['DATASET_NAME']: # use both previous video and previous description 144 | 145 | ds, repeat_images = insertTemporallyLinkedCaptionsVidText(ds, params, 146 | vidtext_set_names={ 147 | 'video': ['train', 'val', 'test'], 148 | 'text': ['train']}) 149 | del repeat_images['test'] 150 | del repeat_images['val'] 151 | # Insert empty prev_descriptions on val and test sets 152 | ds.setInput([], 153 | 'val', 154 | type='text', 155 | id=params['INPUTS_IDS_DATASET'][2], 156 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 157 | tokenization=params['TOKENIZATION_METHOD'], 158 | fill=params['FILL'], 159 | pad_on_batch=True, 160 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 161 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 162 | required=False, 163 | overwrite_split=True) 164 | ds.setInput([], 165 | 'test', 166 | type='text', 167 | id=params['INPUTS_IDS_DATASET'][2], 168 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 169 | tokenization=params['TOKENIZATION_METHOD'], 170 | fill=params['FILL'], 171 | pad_on_batch=True, 172 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 173 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 174 | required=False, 175 | overwrite_split=True) 176 | 177 | elif '-video' in params['DATASET_NAME']: 178 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, params, 179 | set_names=['train', 'val', 'test'], 180 | video=True) 181 | num_captions_val = repeat_images['val'] 182 | num_captions_test = repeat_images['test'] 183 | else: 184 | ds, repeat_images = 
insertTemporallyLinkedCaptions(ds, params) 185 | # Insert empty prev_descriptions on val and test sets 186 | ds.setInput([], 187 | 'val', 188 | type='text', 189 | id=params['INPUTS_IDS_DATASET'][2], 190 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 191 | tokenization=params['TOKENIZATION_METHOD'], 192 | fill=params['FILL'], 193 | pad_on_batch=True, 194 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 195 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 196 | required=False, 197 | overwrite_split=True) 198 | ds.setInput([], 199 | 'test', 200 | type='text', 201 | id=params['INPUTS_IDS_DATASET'][2], 202 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 203 | tokenization=params['TOKENIZATION_METHOD'], 204 | fill=params['FILL'], 205 | pad_on_batch=True, 206 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 207 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 208 | required=False, 209 | overwrite_split=True) 210 | else: 211 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, 212 | params, 213 | set_names=['train', 'val', 'test'], 214 | upperbound=True, 215 | video='-video' in params['DATASET_NAME'], 216 | copy='-copy' in params['DATASET_NAME'], 217 | force_nocopy='-nocopy' in params['DATASET_NAME'], 218 | prev='-prev' in params['DATASET_NAME']) 219 | num_captions_val = repeat_images['val'] 220 | num_captions_test = repeat_images['test'] 221 | 222 | if not '-vidtext-embed' in params['DATASET_NAME']: 223 | # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format: 224 | # ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN] 225 | keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1, set_names=['val', 'test']) 226 | 227 | else: 228 | # Set outputs for -vidtext-embed model 229 | insertVidTextEmbedNegativeSamples(ds, params, 230 | repeat=[num_captions_train, num_captions_val, num_captions_test]) 231 | 232 | if not '-vidtext-embed' in params['DATASET_NAME'] and \ 233 | '-linked' in params['DATASET_NAME'] and \ 234 | '-upperbound' not in params['DATASET_NAME'] and \ 235 | '-video' not in params['DATASET_NAME']: 236 | # Set previous data indices 237 | for s, file in params['LINK_SAMPLE_FILES'].iteritems(): 238 | if s in repeat_images: 239 | rep = repeat_images[s] 240 | else: 241 | rep = 1 242 | ds.setInput(base_path + '/' + file, 243 | s, 244 | type='id', 245 | id=params['INPUTS_IDS_DATASET'][-1], 246 | repeat_set=rep) 247 | 248 | # We have finished loading the dataset, now we can store it for using it in the future 249 | saveDataset(ds, params['DATASET_STORE_PATH']) 250 | else: 251 | # We can easily recover it with a single line 252 | ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl') 253 | 254 | # Load vocabulary-related parameters of dataset used for pre-training 255 | if params['PRE_TRAINED_DATASET_NAME'] is not None: 256 | logging.info('Re-using previous dataset vocabulary ' + params['PRE_TRAINED_DATASET_NAME']) 257 | dataset_pretrained = loadDataset( 258 | params['DATASET_STORE_PATH'] + 'Dataset_' + params['PRE_TRAINED_DATASET_NAME'] + '.pkl') 259 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems(): 260 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained.vocabulary[id_old]) 261 | ds.vocabulary_len[id_new] = copy.deepcopy(dataset_pretrained.vocabulary_len[id_old]) 262 | elif params['PRE_TRAINED_VOCABULARY_NAME'] is not None: 263 | logging.info('Re-using previous vocabulary ' + params['PRE_TRAINED_VOCABULARY_NAME']) 264 | 
dataset_pretrained_vocabulary = pkl2dict( 265 | params['DATASET_STORE_PATH'] + params['PRE_TRAINED_VOCABULARY_NAME'] + '.pkl') 266 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems(): 267 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained_vocabulary[id_old]) 268 | ds.vocabulary_len[id_new] = len(dataset_pretrained_vocabulary[id_old]['idx2words']) 269 | 270 | return ds 271 | 272 | 273 | def keep_n_captions(ds, repeat, n=1, set_names=['val', 'test']): 274 | ''' Keeps only n captions per image and stores the rest in dictionaries for a later evaluation 275 | ''' 276 | 277 | for s, r in zip(set_names, repeat): 278 | logging.info('Keeping ' + str(n) + ' captions per input on the ' + str(s) + ' set.') 279 | 280 | ds.extra_variables[s] = dict() 281 | exec ('n_samples = ds.len_' + s) 282 | 283 | # Process inputs 284 | for id_in in ds.ids_inputs: 285 | new_X = [] 286 | if id_in in ds.optional_inputs: 287 | try: 288 | exec ('X = ds.X_' + s) 289 | i = 0 290 | for next_repeat in r: 291 | for j in range(n): 292 | new_X.append(X[id_in][i + j]) 293 | i += next_repeat 294 | exec ('ds.X_' + s + '[id_in] = new_X') 295 | except: 296 | pass 297 | else: 298 | exec ('X = ds.X_' + s) 299 | i = 0 300 | for next_repeat in r: 301 | for j in range(n): 302 | new_X.append(X[id_in][i + j]) 303 | i += next_repeat 304 | exec ('ds.X_' + s + '[id_in] = new_X') 305 | # Process outputs 306 | for id_out in ds.ids_outputs: 307 | new_Y = [] 308 | exec ('Y = ds.Y_' + s) 309 | dict_Y = dict() 310 | count_samples = 0 311 | i = 0 312 | for next_repeat in r: 313 | dict_Y[count_samples] = [] 314 | for j in range(next_repeat): 315 | if j < n: 316 | new_Y.append(Y[id_out][i + j]) 317 | dict_Y[count_samples].append(Y[id_out][i + j]) 318 | count_samples += 1 319 | i += next_repeat 320 | exec ('ds.Y_' + s + '[id_out] = new_Y') 321 | # store dictionary with vid_pos -> [cap1, cap2, cap3, ..., capNi] 322 | ds.extra_variables[s][id_out] = dict_Y 323 | 324 | new_len = len(new_Y) 325 | exec ('ds.len_' + s + ' = new_len') 326 | logging.info('Samples reduced to ' + str(new_len) + ' in ' + s + ' set.') 327 | 328 | 329 | def insertTemporallyLinkedCaptions(ds, params, set_names=['train'], 330 | upperbound=False, 331 | video=False, copy=False, force_nocopy=False, prev=False): 332 | """ 333 | Inserts an additional input consisting of the desired captions from the previous segment/event 334 | in chronological order. Example: 335 | : 336 | : 337 | . 338 | . 339 | . 340 | : 341 | : 342 | . 343 | . 344 | . 
345 | 346 | :param ds: dataset to modify 347 | :param params: parameters from config 348 | :param set_names: names of the splits that will be modified (default 'train' only) 349 | :param upperbound: whether we want to generate a dataset for an upper bound comparison by using the same captions both as input and output 350 | :param video: whether we use the previous' event video as input instead of the previous caption 351 | :param copy: generates an upperbound dataset only intending to copy giving only matching input-output sequences (only valid if upperbound=True) 352 | :param force_nocopy: generates an upperbound dataset using the same captions both as input and output but avoiding direct copies 353 | :param prev: indicates if we want to use the previous event's caption as input for the next, or use the current event's output instead 354 | 355 | :return: dataset modified with the additional input 356 | """ 357 | base_path = params['DATA_ROOT_PATH'] 358 | repeat_images = dict() 359 | 360 | for s in set_names: 361 | # retrieve number of output captions per sample 362 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s]) 363 | 364 | # get temporal links 365 | links = [] 366 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links: 367 | for line in f_links: 368 | links.append(int(line.strip())) 369 | 370 | outputs = [] 371 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs: 372 | for line in f_outs: 373 | outputs.append(line.strip()) 374 | 375 | # get outputs 376 | if video: 377 | prev_videos = [] 378 | for feat_type in params['FEATURE_NAMES']: 379 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 380 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 381 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts: 382 | prev_videos.append( 383 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]]) 384 | 385 | # modify outputs and prepare inputs 386 | images_repeat = [] 387 | upperbound_images_repeat = [] 388 | final_outputs = [] 389 | if video: 390 | final_inputs = dict() 391 | for feat_type in params['FEATURE_NAMES']: 392 | final_inputs[feat_type] = [[], []] 393 | else: 394 | final_inputs = [] 395 | for i, link in enumerate(links): 396 | ini_out = np.sum(num_cap[:i]) 397 | these_outputs = outputs[ini_out:ini_out + num_cap[i]] 398 | 399 | if upperbound: 400 | if copy: 401 | images_repeat.append(num_cap[i]) 402 | upperbound_images_repeat.append(num_cap[i]) 403 | for out in these_outputs: 404 | final_outputs.append(out) 405 | final_inputs.append(out) 406 | elif prev: 407 | # first sample in the temporally-linked sequence 408 | if link == -1: 409 | images_repeat.append(num_cap[i]) 410 | upperbound_images_repeat.append(num_cap[i]) 411 | for out in these_outputs: 412 | final_outputs.append(out) 413 | final_inputs.append('') 414 | else: 415 | prev_ini_out = np.sum(num_cap[:link]) 416 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 417 | images_repeat.append(num_cap[i] * num_cap[link]) 418 | for n in range(num_cap[link]): 419 | upperbound_images_repeat.append(num_cap[i]) 420 | for out in these_outputs: 421 | final_outputs.append(out) 422 | final_inputs.append(prev_outputs[n]) 423 | elif force_nocopy: 424 | raise NotImplementedError() 425 | prev_outputs = these_outputs 426 | images_repeat.append(num_cap[i] * (num_cap[i] - 1)) 427 | for n in range(num_cap[i]): 428 | 
upperbound_images_repeat.append(num_cap[i] - 1) 429 | for nthese, out in enumerate(these_outputs): 430 | if nthese != n: 431 | final_outputs.append(out) 432 | final_inputs.append(prev_outputs[n]) 433 | else: 434 | prev_outputs = these_outputs 435 | images_repeat.append(num_cap[i] * num_cap[i]) 436 | for n in range(num_cap[i]): 437 | upperbound_images_repeat.append(num_cap[i]) 438 | for out in these_outputs: 439 | final_outputs.append(out) 440 | final_inputs.append(prev_outputs[n]) 441 | else: 442 | if video: 443 | # first sample in the temporally-linked sequence 444 | if link == -1: 445 | images_repeat.append(num_cap[i]) 446 | for out in these_outputs: 447 | final_outputs.append(out) 448 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 449 | final_inputs[feat_type][1] += [0] 450 | else: 451 | images_repeat.append(num_cap[i]) 452 | for out in these_outputs: 453 | final_outputs.append(out) 454 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 455 | if link > 0: 456 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 457 | else: 458 | init_frame = 0 459 | this_count = prev_videos[ifeat][1][link] 460 | final_inputs[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 461 | final_inputs[feat_type][1] += [this_count] 462 | else: 463 | # first sample in the temporally-linked sequence 464 | if link == -1: 465 | images_repeat.append(num_cap[i]) 466 | for out in these_outputs: 467 | final_outputs.append(out) 468 | final_inputs.append('') 469 | else: 470 | prev_ini_out = np.sum(num_cap[:link]) 471 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 472 | images_repeat.append(num_cap[i] * num_cap[link]) 473 | for n in range(num_cap[link]): 474 | for out in these_outputs: 475 | final_outputs.append(out) 476 | final_inputs.append(prev_outputs[n]) 477 | 478 | # Overwrite input images assigning the new repeat pattern 479 | for feat_type in params['FEATURE_NAMES']: 480 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 481 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 482 | 483 | ds.setInput([list_files, counts_files], 484 | s, 485 | type=params['INPUT_DATA_TYPE'], 486 | id=params['INPUTS_IDS_DATASET'][0], 487 | repeat_set=images_repeat, 488 | max_video_len=params['NUM_FRAMES'], 489 | feat_len=params['IMG_FEAT_SIZE'], 490 | overwrite_split=True, 491 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 492 | 493 | if not video: 494 | # Overwrite outputs assigning the new outputs repeat pattern 495 | ds.setOutput(final_outputs, 496 | s, 497 | type='text', 498 | id=params['OUTPUTS_IDS_DATASET'][0], 499 | build_vocabulary=True, 500 | tokenization=params['TOKENIZATION_METHOD'], 501 | fill=params['FILL'], 502 | pad_on_batch=True, 503 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 504 | sample_weights=params['SAMPLE_WEIGHTS'], 505 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 506 | overwrite_split=True) 507 | 508 | # Overwrite the input state_below assigning the new outputs repeat pattern 509 | ds.setInput(final_outputs, 510 | s, 511 | type='text', 512 | id=params['INPUTS_IDS_DATASET'][1], 513 | required=False, 514 | tokenization=params['TOKENIZATION_METHOD'], 515 | pad_on_batch=True, 516 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 517 | offset=1, 518 | fill=params['FILL'], 519 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 520 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 521 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 522 | overwrite_split=True) 523 | 524 | if video: 525 | for 
feat_type in params['FEATURE_NAMES']: 526 | ds.setInput(final_inputs[feat_type], 527 | s, 528 | type=params['INPUT_DATA_TYPE'], 529 | id=params['INPUTS_IDS_DATASET'][2], 530 | repeat_set=images_repeat, 531 | max_video_len=params['NUM_FRAMES'], 532 | feat_len=params['IMG_FEAT_SIZE'], 533 | overwrite_split=True, 534 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 535 | else: 536 | # Set new input captions from previous temporally-linked event/video 537 | ds.setInput(final_inputs, 538 | s, 539 | type='text', 540 | id=params['INPUTS_IDS_DATASET'][2], 541 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 542 | tokenization=params['TOKENIZATION_METHOD'], 543 | fill=params['FILL'], 544 | pad_on_batch=True, 545 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 546 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 547 | 548 | if upperbound: 549 | images_repeat = upperbound_images_repeat 550 | repeat_images[s] = images_repeat 551 | 552 | return ds, repeat_images 553 | 554 | 555 | def insertTemporallyLinkedCaptionsVidText(ds, params, vidtext_set_names={'video': ['train'], 'text': ['train']}): 556 | """ 557 | Inserts two additional input consisting of the videos and captions from the previous segment/event 558 | in chronological order. Example: 559 | : 560 | : 561 | . 562 | . 563 | . 564 | : 565 | : 566 | . 567 | . 568 | . 569 | 570 | :param ds: dataset to modify 571 | :param params: parameters from config 572 | :param vidtext_set_names: dictionary names of the splits that will be modified for 'video' and for 'text' 573 | 574 | :return: dataset modified with the additional input 575 | """ 576 | base_path = params['DATA_ROOT_PATH'] 577 | repeat_images = dict() 578 | 579 | set_names = set(vidtext_set_names['video'] + vidtext_set_names['text']) 580 | for s in set_names: 581 | # retrieve number of output captions per sample 582 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s]) 583 | 584 | # get temporal links 585 | links = [] 586 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links: 587 | for line in f_links: 588 | links.append(int(line.strip())) 589 | 590 | outputs = [] 591 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs: 592 | for line in f_outs: 593 | outputs.append(line.strip()) 594 | 595 | # get outputs 596 | if s in vidtext_set_names['video']: 597 | prev_videos = [] 598 | for feat_type in params['FEATURE_NAMES']: 599 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 600 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 601 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts: 602 | prev_videos.append( 603 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]]) 604 | 605 | # modify outputs and prepare inputs 606 | images_repeat = [] 607 | final_outputs = [] 608 | if s in vidtext_set_names['video']: 609 | final_inputs_vid = dict() 610 | for feat_type in params['FEATURE_NAMES']: 611 | final_inputs_vid[feat_type] = [[], []] 612 | final_inputs_txt = [] 613 | 614 | for i, link in enumerate(links): 615 | ini_out = np.sum(num_cap[:i]) 616 | these_outputs = outputs[ini_out:ini_out + num_cap[i]] 617 | 618 | # first sample in the temporally-linked sequence 619 | if link == -1: 620 | images_repeat.append(num_cap[i]) 621 | for out in these_outputs: 622 | final_outputs.append(out) 623 | if s in vidtext_set_names['text']: 624 | final_inputs_txt.append('') 625 | if s in vidtext_set_names['video']: 626 | for ifeat, 
feat_type in enumerate(params['FEATURE_NAMES']): 627 | final_inputs_vid[feat_type][1] += [0] 628 | else: 629 | if s in vidtext_set_names['text']: 630 | prev_ini_out = np.sum(num_cap[:link]) 631 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 632 | images_repeat.append(num_cap[i] * num_cap[link]) 633 | else: 634 | images_repeat.append(num_cap[i]) 635 | 636 | # video only 637 | if s not in vidtext_set_names['text'] and s in vidtext_set_names['video']: 638 | for out in these_outputs: 639 | final_outputs.append(out) 640 | 641 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 642 | if link > 0: 643 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 644 | else: 645 | init_frame = 0 646 | this_count = prev_videos[ifeat][1][link] 647 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 648 | final_inputs_vid[feat_type][1] += [this_count] 649 | 650 | # text only 651 | elif s in vidtext_set_names['text'] and s not in vidtext_set_names['video']: 652 | for n in range(num_cap[link]): 653 | for out in these_outputs: 654 | final_outputs.append(out) 655 | final_inputs_txt.append(prev_outputs[n]) 656 | 657 | # both 658 | else: 659 | for n in range(num_cap[link]): 660 | for out in these_outputs: 661 | final_outputs.append(out) 662 | final_inputs_txt.append(prev_outputs[n]) 663 | 664 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 665 | if link > 0: 666 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 667 | else: 668 | init_frame = 0 669 | this_count = prev_videos[ifeat][1][link] 670 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 671 | final_inputs_vid[feat_type][1] += [this_count] 672 | 673 | # Overwrite input images assigning the new repeat pattern 674 | for feat_type in params['FEATURE_NAMES']: 675 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 676 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 677 | 678 | ds.setInput([list_files, counts_files], 679 | s, 680 | type=params['INPUT_DATA_TYPE'], 681 | id=params['INPUTS_IDS_DATASET'][0], 682 | repeat_set=images_repeat, 683 | max_video_len=params['NUM_FRAMES'], 684 | feat_len=params['IMG_FEAT_SIZE'], 685 | overwrite_split=True, 686 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 687 | 688 | # if text 689 | if s in vidtext_set_names['text']: 690 | # Overwrite outputs assigning the new outputs repeat pattern 691 | ds.setOutput(final_outputs, 692 | s, 693 | type='text', 694 | id=params['OUTPUTS_IDS_DATASET'][0], 695 | build_vocabulary=True, 696 | tokenization=params['TOKENIZATION_METHOD'], 697 | fill=params['FILL'], 698 | pad_on_batch=True, 699 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 700 | sample_weights=params['SAMPLE_WEIGHTS'], 701 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 702 | overwrite_split=True) 703 | 704 | # Overwrite the input state_below assigning the new outputs repeat pattern 705 | ds.setInput(final_outputs, 706 | s, 707 | type='text', 708 | id=params['INPUTS_IDS_DATASET'][1], 709 | required=False, 710 | tokenization=params['TOKENIZATION_METHOD'], 711 | pad_on_batch=True, 712 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 713 | offset=1, 714 | fill=params['FILL'], 715 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 716 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 717 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 718 | overwrite_split=True) 719 | 720 | if s in vidtext_set_names['video']: 721 | for feat_type in 
params['FEATURE_NAMES']: 722 | ds.setInput(final_inputs_vid[feat_type], 723 | s, 724 | type=params['INPUT_DATA_TYPE'], 725 | id=params['INPUTS_IDS_DATASET'][3], 726 | repeat_set=images_repeat, 727 | max_video_len=params['NUM_FRAMES'], 728 | feat_len=params['IMG_FEAT_SIZE'], 729 | overwrite_split=True, 730 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 731 | 732 | if s in vidtext_set_names['text']: 733 | # Set new input captions from previous temporally-linked event/video 734 | ds.setInput(final_inputs_txt, 735 | s, 736 | type='text', 737 | id=params['INPUTS_IDS_DATASET'][2], 738 | required=False, 739 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 740 | tokenization=params['TOKENIZATION_METHOD'], 741 | fill=params['FILL'], 742 | pad_on_batch=True, 743 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 744 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 745 | overwrite_split=True) 746 | 747 | repeat_images[s] = images_repeat 748 | 749 | return ds, repeat_images 750 | 751 | 752 | def insertVidTextEmbedNegativeSamples(ds, params, repeat): 753 | """ 754 | Inserts negative balanced examples for training a Video-Text Embedding model. 755 | 756 | :param ds: dataset object with inputs of positive samples inserted 757 | :param params: config params 758 | :param repeat: number of times each video was repeated 759 | """ 760 | 761 | for s, r in zip(['train', 'val', 'test'], repeat): 762 | 763 | # Get data from dataset 764 | X = None 765 | num_samples = 0 766 | exec ('num_samples = ds.len_' + s) 767 | exec ('X = ds.X_' + s) 768 | 769 | video_indices = X[params['INPUTS_IDS_DATASET'][0]] 770 | descriptions = X[params['INPUTS_IDS_DATASET'][1]] 771 | 772 | # Get real indices considering repetitions 773 | desc_real_indices = np.repeat(range(len(r)), r) 774 | 775 | # Let's generate some random video-description pairs 776 | negative_videos = np.random.choice(video_indices, num_samples, replace=True) 777 | for neg_id in negative_videos: 778 | # Insert index of repeated video (now as negative sample) 779 | video_indices.append(neg_id) 780 | 781 | # New find random description (avoiding correct descriptions for the selected video) 782 | real_id = desc_real_indices[neg_id] 783 | desc_id = np.random.choice([ind for ind in range(num_samples) if desc_real_indices[ind] != real_id], 1)[0] 784 | 785 | # Insert description of negative sample 786 | descriptions.append(descriptions[desc_id]) 787 | 788 | # Re-insert videos and descriptions, including new length 789 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][0] + '"] = video_indices') 790 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][1] + '"] = descriptions') 791 | exec ('ds.len_' + s + ' = num_samples*2') 792 | 793 | # Insert output, which consists in 'matching'/'non-matching labels' 794 | matches = [1 for i in range(num_samples)] + [0 for i in range(num_samples)] 795 | ds.setOutput(matches, 796 | s, 797 | type='categorical', 798 | id=params['OUTPUTS_IDS_DATASET'][0]) 799 | 800 | ds.setClasses(['matching', 'non-matching'], id=params['OUTPUTS_IDS_DATASET'][0]) 801 | -------------------------------------------------------------------------------- /data_engine/split_data.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import numpy as np 5 | import xlrd 6 | 7 | # Split the existent data in train, val and test 8 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc' 9 | split_prop = {'train': 0.7, 10 | 'val': 0.15, 11 | 'test': 0.15, 12 | } 13 | sets_names = 
['Estefania1', 'Estefania2', 'Estefania3', 'Estefania4', 'Estefania5', 14 | 'Gabriel1', 'Gabriel2', 'Gabriel3', 'Gabriel4', 15 | 'MAngeles1', 'MAngeles2', 'MAngeles3', 'MAngeles4', 16 | 'Marc1', 'Marc2', 'Marc3', 'Marc4', 'Marc5', 'Marc6', 'Marc7', 'Marc8', 'Marc9', 17 | 'Marc10', 'Marc11', 'Marc12', 'Marc13', 'Marc14', 'Marc15', 'Marc16', 'Marc17', 'Marc18', 18 | 'MarcC1', 19 | 'Mariella', 'Mariella2', 'Mariella3', 20 | 'Maya1', 'Maya2', 'Maya3', 'Maya4', 'Maya5', 'Maya6', 'Maya7', 'Maya8', 21 | 'Maya9', 'Maya10', 'Maya11', 'Maya12', 'Maya13', 'Maya14', 22 | 'Pedro1', 'Pedro2', 'Pedro3', 'Pedro4', 23 | # 'Txell1' 24 | 'Petia1', 'Petia2', 25 | ] 26 | 27 | sets = {'train': ['Maya14', 'Maya11', 'Maya10', 'Maya13', 'Maya12', 'Petia2', 28 | 'MAngeles4', 'Mariella', 'MAngeles1', 'Pedro1', 'MAngeles3', 29 | 'Pedro3', 'MarcC1', 'Estefania1', 'Estefania3', 'Marc18', 'Maya5', 30 | 'Gabriel3', 'Maya6', 'Maya1', 'Maya3', 'Marc16', 'Marc17', 31 | 'Marc15', 'Maya9', 'Maya8', 'Marc10', 'Marc11', 'Gabriel2', 32 | 'Marc7', 'Maya4', 'MAngeles2', 'Gabriel1', 'Marc8', 'Marc12', 33 | 'Marc5', 'Mariella3', 'Marc2', 'Marc3'], 34 | 'val': ['Pedro4', 'Pedro2', 'Estefania4', 'Maya7', 'Marc6', 'Petia1', 'Mariella2'], 35 | 'test': ['Estefania2', 'Marc1', 'Estefania5', 'Marc9', 'Gabriel4', 'Maya2', 'Marc4', 'Marc14', 'Marc13'], 36 | } 37 | 38 | # input data paths 39 | in_features_path = 'Features/Features_original' # /.csv 40 | in_descriptions_path = 'GT/descriptions' # .txt 41 | in_segments_path = 'GT/segmentations' # GT_.xls(x) 42 | in_images_path = 'Images' # /.jpg 43 | in_features_name = 'GoogleNet_ImageNet' 44 | format = '.jpg' 45 | # list of non-informative images stored in /NonInfo/.csv 46 | # leave empty for not using it 47 | in_noninfo_path = 'Features/NonInfo' 48 | noninformative_prefix = 'infoCNN_outputClasses' 49 | 50 | # output data paths 51 | out_features_path = 'Features' # __all_frames.csv & __all_frames_counts.txt 52 | out_descriptions_path = 'Annotations' # captions.id.en & _list.txt 53 | out_features_name = 'ImageNet_Without_NonInfo' 54 | separator = '----' 55 | 56 | #################################### 57 | 58 | if noninformative_prefix: 59 | suffix_name = '_without_noninfo' 60 | else: 61 | suffix_name = '' 62 | 63 | # Only apply random selection if the sets split is not already provided 64 | if not sets: 65 | # generate data splits 66 | available_sets = len(sets_names) 67 | randomized = np.random.choice(sets_names, available_sets, replace=False) 68 | 69 | # randomized = np.array(sets_names) 70 | 71 | sets = dict() 72 | picked_so_far = 0 73 | for s, p in split_prop.iteritems(): 74 | last_picked = np.ceil(picked_so_far + available_sets * p) 75 | sets[s] = randomized[picked_so_far:last_picked] 76 | picked_so_far = last_picked 77 | 78 | # read images 79 | images = dict() 80 | for n, s in sets.iteritems(): 81 | for set in s: 82 | images[set] = [] 83 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + set + '/*' + format) 84 | for im in these_images: 85 | images[set].append(im.split('/')[-1].split('.')[0]) 86 | images[set] = sorted(images[set]) 87 | 88 | # read segmentations 89 | events = dict() 90 | for n, s in sets.iteritems(): 91 | for set in s: 92 | possible_names = ['/GT_' + set + '.xls', '/GT_' + set + '.xlsx', '/' + set + '.xls', '/' + set + '.xlsx'] 93 | exists = False 94 | i = 0 95 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]): 96 | i += 1 97 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i]) 98 | sheet = 
file.sheet_by_index(0) 99 | 100 | these_events = [] 101 | empty = False 102 | i = 2 # 1st row with info 103 | while not empty: 104 | try: 105 | evt = sheet.cell(i, 1).value.split() 106 | if len(evt) == 1: 107 | evt = sheet.cell(i, 1).value.split('-') 108 | if evt: 109 | these_events.append([evt[0].strip(), evt[1].strip()]) 110 | else: 111 | empty = True 112 | i += 1 113 | except: 114 | empty = True 115 | events[set] = these_events 116 | 117 | # get frames counts from segments and images lists 118 | counts = dict() 119 | for n, s in sets.iteritems(): 120 | counts[n] = [] 121 | for set in s: 122 | counts[set] = [] 123 | prev = -1 124 | for e in events[set]: 125 | if e[1] not in images[set]: 126 | e[1] = '0' + e[1] 127 | if e[0] not in images[set]: 128 | e[0] = '0' + e[0] 129 | 130 | if prev != -1 and images[set].index(e[0]) - images[set].index(prev) > 1: 131 | raise Exception(images[set].index(e[0]), images[set].index(prev)) 132 | c = images[set].index(e[1]) - images[set].index(e[0]) + 1 133 | prev = e[1] 134 | 135 | counts[set].append(c) 136 | counts[n].append(c) 137 | 138 | assert np.sum(counts[set]) == len(images[set]) 139 | 140 | # get erroneous segments 141 | to_remove = dict() 142 | for n, s in sets.iteritems(): 143 | to_remove[n] = dict() 144 | for set in s: 145 | to_remove[n][set] = [] 146 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file: 147 | prev_segm = -1 148 | count = 0 149 | segm_count = 0 150 | segm_count_show = 0 151 | for cline, line in enumerate(desc_file): 152 | if line: 153 | line = line.rstrip('\n').split(',') 154 | segm = line[0] 155 | desc = ','.join(line[1:]) 156 | desc = desc.strip().lower() 157 | if desc == 'error': 158 | to_remove[n][set].append(segm_count) 159 | else: 160 | if prev_segm != segm: 161 | segm_count_show += 1 162 | count = 0 163 | count += 1 164 | assert segm[:7] == 'Segment', set + ', line ' + str(cline) 165 | if prev_segm != segm: 166 | if prev_segm == -1: 167 | assert int(segm[7:]) == 1 168 | else: 169 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str( 170 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1) 171 | segm_count += 1 172 | prev_segm = segm 173 | 174 | # get features for each data splits 175 | print 'Building features files...' 
176 | print '----------------------------------------' 177 | for n, s in sets.iteritems(): 178 | extra_removed = 0 179 | written_in_file = 0 180 | all_total = 0 181 | all_error = 0 182 | feats_file = open( 183 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames' + suffix_name + '.csv', 184 | 'w') 185 | counts_file = open( 186 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames_counts' + suffix_name + '.txt', 187 | 'w') 188 | for set in s: 189 | these_removed = to_remove[n][set] 190 | these_counts = counts[set] 191 | feats_set = open(data_path + '/' + in_features_path + '/' + set + '/' + in_features_name + '.csv', 'r') 192 | if noninformative_prefix: 193 | noninfo_file = open(data_path + '/' + in_noninfo_path + '/' + noninformative_prefix + '_' + set + '.csv', 194 | 'r') 195 | for ic, count in enumerate(these_counts): 196 | all_total += 1 197 | new_count = 0 198 | these_feats = [] 199 | for c in range(count): 200 | line = feats_set.next().rstrip('\n') 201 | is_informative = True 202 | if noninformative_prefix: 203 | noninfo_line = noninfo_file.next().rstrip('\n') 204 | # checks if the current frame is non-informative and discards it 205 | if float(noninfo_line.split(',')[0]) >= 0.5: 206 | is_informative = False 207 | if is_informative: 208 | these_feats.append(line) 209 | new_count += 1 210 | if ic in these_removed: 211 | all_error += 1 212 | # Empty sequence due to non-informative removal. Let's introduce it into to_remove list 213 | if noninformative_prefix and len(these_feats) == 0: 214 | if ic not in these_removed: 215 | extra_removed += 1 216 | to_remove[n][set].append(ic) 217 | these_removed.append(ic) 218 | if ic not in these_removed: 219 | written_in_file += 1 220 | for feat in these_feats: 221 | feats_file.write(feat + '\n') 222 | counts_file.write(str(new_count) + '\n') 223 | 224 | if noninformative_prefix: 225 | noninfo_file.close() 226 | feats_set.close() 227 | feats_file.close() 228 | counts_file.close() 229 | 230 | print 'Extra removed', n, ':', extra_removed 231 | print 'Written in file', n, ':', written_in_file 232 | print '"ERROR" events', n, ':', all_error 233 | print 'Total original events', n, ':', all_total 234 | print 235 | 236 | # get descriptions for each segment 237 | print 'Building captions files...' 
238 | print '----------------------------------------' 239 | caption_general = open(data_path + '/' + out_descriptions_path + '/' + 'captions_final' + suffix_name + '.id.en', 'w') 240 | for n, s in sets.iteritems(): 241 | written_in_file = 0 242 | all_total = 0 243 | all_error = 0 244 | split_file = open(data_path + '/' + out_descriptions_path + '/' + n + '_list_final' + suffix_name + '.txt', 'w') 245 | for set in s: 246 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file: 247 | prev_segm = -1 248 | count = 0 249 | segm_count = -1 250 | segm_count_show = 0 251 | for cline, line in enumerate(desc_file): 252 | if line: 253 | line = line.rstrip('\n').split(',') 254 | segm = line[0] 255 | desc = ','.join(line[1:]) 256 | desc = desc.strip().lower() 257 | if prev_segm != segm: 258 | all_total += 1 259 | if prev_segm == -1: 260 | assert int(segm[7:]) == 1 261 | else: 262 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str( 263 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1) 264 | segm_count += 1 265 | if desc != 'error' and segm_count not in to_remove[n][set]: 266 | if prev_segm != segm: 267 | written_in_file += 1 268 | segm_count_show += 1 269 | split_file.write(set + '_Segment_' + str(segm_count_show) + '\n') 270 | count = 0 271 | caption_general.write(set + '_Segment_' + str(segm_count_show) 272 | + '#' + str(count) + separator + desc + '\n') 273 | count += 1 274 | else: 275 | if prev_segm != segm: 276 | all_error += 1 277 | assert segm[:7] == 'Segment', set + ', line ' + str(cline) 278 | 279 | prev_segm = segm 280 | try: 281 | int(segm[7:]) 282 | except: 283 | raise Exception(set + ' wrong Segment identifier: ' + segm) 284 | assert segm_count + 1 == int(segm[7:]), set + ': ' + str(segm_count + 1) + ' != ' + segm[7:] 285 | assert len(counts[set]) == segm_count + 1, set + ': ' + str(segm_count + 1) + ' != ' + str(len(counts[set])) 286 | 287 | split_file.close() 288 | 289 | print 'Written in file', n, ':', written_in_file 290 | print 'All removed events', n, ':', all_error 291 | print 'Total original events', n, ':', all_total 292 | print 293 | 294 | caption_general.close() 295 | 296 | print 'DONE!' 
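# --- Optional consistency check (an added sketch, not part of the original script) ---
# For every split, the features counts file written above should contain exactly as many
# lines as the corresponding <split>_list_final<suffix_name>.txt, since both skip the
# segments marked as 'error' or emptied by the non-informative filtering. Disabled by
# default so the behaviour of the script above is unchanged.
RUN_CONSISTENCY_CHECK = False
if RUN_CONSISTENCY_CHECK:
    for n in sets:
        counts_path = (data_path + '/' + out_features_path + '/' + n + '_' + out_features_name
                       + '_all_frames_counts' + suffix_name + '.txt')
        list_path = data_path + '/' + out_descriptions_path + '/' + n + '_list_final' + suffix_name + '.txt'
        with open(counts_path) as f_counts, open(list_path) as f_list:
            n_counts = sum(1 for _ in f_counts)
            n_events = sum(1 for _ in f_list)
        print n, ':', n_counts, 'feature count lines vs', n_events, 'listed events'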
297 | -------------------------------------------------------------------------------- /data_engine/subsample_frames_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/' 4 | features_path = 'Features/Full_Features' 5 | output_path = 'Features' 6 | 7 | n_frames_per_video_subsample = 26 # subsample a fixed number of equidistant frames per video 8 | repeat_frames = False # decides whether we repeat some frames when needed to fill the desired 9 | # "n_frames_per_video_subsample", or simply fill the missing video frames with 0s 10 | 11 | 12 | # Inputs 13 | features_name = 'C3D_fc8_ImageNet' 14 | features_files = ['train_' + features_name + '_features.csv', 'val_' + features_name + '_features.csv', 15 | 'test_' + features_name + '_features.csv'] 16 | features_counts_files = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 17 | 'test_' + features_name + '_counts.txt'] 18 | 19 | # Outputs 20 | out_features_name = 'C3D_fc8_ImageNet' 21 | out_features = ['train_' + out_features_name + '.csv', 'val_' + out_features_name + '.csv', 22 | 'test_' + out_features_name + '.csv'] 23 | out_features_counts = ['train_' + out_features_name + '_counts.txt', 'val_' + out_features_name + '_counts.txt', 24 | 'test_' + out_features_name + '_counts.txt'] 25 | 26 | ######### 27 | 28 | for ff_, fc_, of_, oc_ in zip(features_files, features_counts_files, out_features, out_features_counts): 29 | 30 | print 'Processing file', base_path + '/' + features_path + '/' + ff_ 31 | 32 | # Open files 33 | ff = open(base_path + '/' + features_path + '/' + ff_, 'r') 34 | fc = open(base_path + '/' + features_path + '/' + fc_, 'r') 35 | of = open(base_path + '/' + output_path + '/' + of_, 'w') 36 | oc = open(base_path + '/' + output_path + '/' + oc_, 'w') 37 | 38 | # Process each video 39 | for count_videos, count in enumerate(fc): 40 | # Calculate chosen frames 41 | count = int(count.strip('\n')) 42 | # pick_pos = np.round(np.linspace(0,count-1,n_frames_per_video_subsample)).astype('int64') 43 | pick_pos = np.linspace(0, count - 1, n_frames_per_video_subsample).astype('int64') 44 | if not repeat_frames: 45 | pick_pos = np.unique(pick_pos) 46 | count_pick = len(pick_pos) 47 | 48 | # Get all frames from current video 49 | feats = [[] for i in range(count)] 50 | for i in range(count): 51 | feats[i] = ff.next() 52 | 53 | # Get chosen frames 54 | for p in pick_pos: 55 | of.write(feats[p]) 56 | oc.write(str(count_pick) + '\n') 57 | if count_pick != n_frames_per_video_subsample: 58 | print "different", count_videos 59 | print "num", count_pick 60 | 61 | ff.close() 62 | fc.close() 63 | of.close() 64 | oc.close() 65 | 66 | print 'Output stored in', base_path + '/' + output_path + '/' + of_ 67 | -------------------------------------------------------------------------------- /docs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/docs/model.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import logging 3 | import sys 4 | from timeit import default_timer as timer 5 | 6 | from config import load_parameters 7 | from data_engine.prepare_data import build_dataset 8 | from keras_wrapper.cnn_model import loadModel, 
transferWeights, updateModel 9 | from keras_wrapper.extra.callbacks import EvalPerformance, Sample 10 | from keras_wrapper.extra.evaluation import selectMetric 11 | from keras_wrapper.extra.read_write import dict2pkl, list2file 12 | from keras_wrapper.utils import decode_predictions_beam_search, decode_predictions 13 | from viddesc_model import VideoDesc_Model 14 | 15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def train_model(params): 20 | """ 21 | Training function. Sets the training parameters from params. Build or loads the model and launches the training. 22 | :param params: Dictionary of network hyperparameters. 23 | :return: None 24 | """ 25 | 26 | if params['RELOAD'] > 0: 27 | logging.info('Resuming training.') 28 | 29 | check_params(params) 30 | 31 | ########### Load data 32 | dataset = build_dataset(params) 33 | if not '-vidtext-embed' in params['DATASET_NAME']: 34 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 35 | else: 36 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]] 37 | ########### 38 | 39 | 40 | ########### Build model 41 | 42 | if params['MODE'] == 'finetuning': 43 | # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) 44 | video_model = VideoDesc_Model(params, 45 | type=params['MODEL_TYPE'], 46 | verbose=params['VERBOSE'], 47 | model_name=params['MODEL_NAME'] + '_reloaded', 48 | vocabularies=dataset.vocabulary, 49 | store_path=params['STORE_PATH'], 50 | set_optimizer=False, 51 | clear_dirs=False) 52 | video_model = updateModel(video_model, params['RELOAD_PATH'], params['RELOAD'], reload_epoch=False) 53 | video_model.setParams(params) 54 | 55 | # Define the inputs and outputs mapping from our Dataset instance to our model 56 | inputMapping = dict() 57 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): 58 | if len(video_model.ids_inputs) > i: 59 | pos_source = dataset.ids_inputs.index(id_in) 60 | id_dest = video_model.ids_inputs[i] 61 | inputMapping[id_dest] = pos_source 62 | video_model.setInputsMapping(inputMapping) 63 | 64 | outputMapping = dict() 65 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): 66 | if len(video_model.ids_outputs) > i: 67 | pos_target = dataset.ids_outputs.index(id_out) 68 | id_dest = video_model.ids_outputs[i] 69 | outputMapping[id_dest] = pos_target 70 | video_model.setOutputsMapping(outputMapping) 71 | 72 | video_model.setOptimizer() 73 | params['MAX_EPOCH'] += params['RELOAD'] 74 | 75 | else: 76 | if params['RELOAD'] == 0 or params['LOAD_WEIGHTS_ONLY']: # build new model 77 | video_model = VideoDesc_Model(params, 78 | type=params['MODEL_TYPE'], 79 | verbose=params['VERBOSE'], 80 | model_name=params['MODEL_NAME'], 81 | vocabularies=dataset.vocabulary, 82 | store_path=params['STORE_PATH'], 83 | set_optimizer=True) 84 | dict2pkl(params, params['STORE_PATH'] + '/config') 85 | 86 | # Define the inputs and outputs mapping from our Dataset instance to our model 87 | inputMapping = dict() 88 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): 89 | if len(video_model.ids_inputs) > i: 90 | pos_source = dataset.ids_inputs.index(id_in) 91 | id_dest = video_model.ids_inputs[i] 92 | inputMapping[id_dest] = pos_source 93 | video_model.setInputsMapping(inputMapping) 94 | 95 | outputMapping = dict() 96 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): 97 | if 
len(video_model.ids_outputs) > i: 98 | pos_target = dataset.ids_outputs.index(id_out) 99 | id_dest = video_model.ids_outputs[i] 100 | outputMapping[id_dest] = pos_target 101 | video_model.setOutputsMapping(outputMapping) 102 | 103 | # Only load weights from pre-trained model 104 | if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0: 105 | for i in range(0, len(params['RELOAD'])): 106 | old_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'][i], params['RELOAD'][i]) 107 | video_model = transferWeights(old_model, video_model, params['LAYERS_MAPPING'][i]) 108 | video_model.setOptimizer() 109 | params['RELOAD'] = 0 110 | else: # resume from previously trained model 111 | video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) 112 | video_model.params['LR'] = params['LR'] 113 | video_model.setOptimizer() 114 | 115 | if video_model.model_path != params['STORE_PATH']: 116 | video_model.setName(params['MODEL_NAME'], models_path=params['STORE_PATH'], clear_dirs=False) 117 | # Update optimizer either if we are loading or building a model 118 | video_model.params = params 119 | video_model.setOptimizer() 120 | ########### 121 | 122 | 123 | ########### Test model saving/loading functions 124 | # saveModel(video_model, params['RELOAD']) 125 | # video_model = loadModel(params['STORE_PATH'], params['RELOAD']) 126 | ########### 127 | 128 | 129 | ########### Callbacks 130 | callbacks = buildCallbacks(params, video_model, dataset) 131 | ########### 132 | 133 | 134 | ########### Training 135 | total_start_time = timer() 136 | 137 | logger.debug('Starting training!') 138 | training_params = {'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 139 | 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 140 | 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 141 | 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 142 | 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 143 | 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 144 | 'data_augmentation': params['DATA_AUGMENTATION'], 145 | 'patience': params.get('PATIENCE', 0), # early stopping parameters 146 | 'metric_check': params.get('STOP_METRIC', None), 147 | 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 148 | 'each_n_epochs': params.get('EVAL_EACH', 1), 149 | 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0) 150 | } 151 | 152 | video_model.trainNet(dataset, training_params) 153 | 154 | total_end_time = timer() 155 | time_difference = total_end_time - total_start_time 156 | logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0)) 157 | 158 | 159 | def apply_Video_model(params): 160 | """ 161 | Function for using a previously trained model for sampling. 
162 | """ 163 | 164 | ########### Load data 165 | dataset = build_dataset(params) 166 | if not '-vidtext-embed' in params['DATASET_NAME']: 167 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 168 | else: 169 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]] 170 | ########### 171 | 172 | 173 | ########### Load model 174 | video_model = loadModel(params['STORE_PATH'], params['SAMPLING_RELOAD_POINT'], 175 | reload_epoch=params['SAMPLING_RELOAD_EPOCH']) 176 | video_model.setOptimizer() 177 | ########### 178 | 179 | 180 | ########### Apply sampling 181 | extra_vars = dict() 182 | extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) 183 | extra_vars['language'] = params.get('TRG_LAN', 'en') 184 | 185 | for s in params["EVAL_ON_SETS"]: 186 | 187 | # Apply model predictions 188 | params_prediction = {'max_batch_size': params['BATCH_SIZE'], 189 | 'n_parallel_loaders': params['PARALLEL_LOADERS'], 190 | 'predict_on_sets': [s]} 191 | 192 | # Convert predictions into sentences 193 | if not '-vidtext-embed' in params['DATASET_NAME']: 194 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 195 | else: 196 | vocab = None 197 | 198 | if params['BEAM_SEARCH']: 199 | params_prediction['beam_size'] = params['BEAM_SIZE'] 200 | params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] 201 | params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params[ 202 | 'DATASET_NAME'] 203 | params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] 204 | params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 205 | params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 206 | params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 207 | params_prediction['normalize_probs'] = params['NORMALIZE_SAMPLING'] 208 | params_prediction['alpha_factor'] = params['ALPHA_FACTOR'] 209 | params_prediction['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in \ 210 | params[ 211 | 'DATASET_NAME'] and '-video' not in \ 212 | params[ 213 | 'DATASET_NAME'] 214 | predictions = video_model.predictBeamSearchNet(dataset, params_prediction)[s] 215 | predictions = decode_predictions_beam_search(predictions, vocab, verbose=params['VERBOSE']) 216 | else: 217 | predictions = video_model.predictNet(dataset, params_prediction)[s] 218 | predictions = decode_predictions(predictions, 1, vocab, params['SAMPLING'], verbose=params['VERBOSE']) 219 | 220 | # Store result 221 | filepath = video_model.model_path + '/' + s + '_sampling.pred' # results file 222 | if params['SAMPLING_SAVE_MODE'] == 'list': 223 | list2file(filepath, predictions) 224 | else: 225 | raise Exception, 'Only "list" is allowed in "SAMPLING_SAVE_MODE"' 226 | 227 | # Evaluate if any metric in params['METRICS'] 228 | for metric in params['METRICS']: 229 | logging.info('Evaluating on metric ' + metric) 230 | filepath = video_model.model_path + '/' + s + '_sampling.' 
+ metric # results file 231 | 232 | # Evaluate on the chosen metric 233 | extra_vars[s] = dict() 234 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]] 235 | metrics = selectMetric[metric]( 236 | pred_list=predictions, 237 | verbose=1, 238 | extra_vars=extra_vars, 239 | split=s) 240 | 241 | # Print results to file 242 | with open(filepath, 'w') as f: 243 | header = '' 244 | line = '' 245 | for metric_ in sorted(metrics): 246 | value = metrics[metric_] 247 | header += metric_ + ',' 248 | line += str(value) + ',' 249 | f.write(header + '\n') 250 | f.write(line + '\n') 251 | logging.info('Done evaluating on metric ' + metric) 252 | 253 | 254 | def buildCallbacks(params, model, dataset): 255 | """ 256 | Builds the selected set of callbacks run during the training of the model. 257 | 258 | :param params: Dictionary of network hyperparameters. 259 | :param model: Model instance on which to apply the callback. 260 | :param dataset: Dataset instance on which to apply the callback. 261 | :return: 262 | """ 263 | 264 | callbacks = [] 265 | 266 | if params['METRICS']: 267 | # Evaluate training 268 | extra_vars = {'language': params.get('TRG_LAN', 'en'), 269 | 'n_parallel_loaders': params['PARALLEL_LOADERS'], 270 | 'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD'])} 271 | 272 | if not '-vidtext-embed' in params['DATASET_NAME']: 273 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 274 | for s in params['EVAL_ON_SETS']: 275 | extra_vars[s] = dict() 276 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]] 277 | else: 278 | vocab = None 279 | extra_vars['n_classes'] = len(dataset.dic_classes[params['OUTPUTS_IDS_DATASET'][0]].values()) 280 | for s in params['EVAL_ON_SETS']: 281 | extra_vars[s] = dict() 282 | extra_vars[s]['references'] = eval('dataset.Y_' + s + '["' + params['OUTPUTS_IDS_DATASET'][0] + '"]') 283 | 284 | if params['BEAM_SEARCH']: 285 | extra_vars['beam_size'] = params.get('BEAM_SIZE', 6) 286 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1) 287 | extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30) 288 | extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True) and '-upperbound' not in params[ 289 | 'DATASET_NAME'] 290 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] 291 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 292 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 293 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 294 | extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) 295 | extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.) 
296 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[ 297 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME'] 298 | input_text_id = None 299 | vocab_src = None 300 | 301 | callback_metric = EvalPerformance(model, 302 | dataset, 303 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 304 | metric_name=params['METRICS'], 305 | set_name=params['EVAL_ON_SETS'], 306 | batch_size=params['BATCH_SIZE'], 307 | each_n_epochs=params['EVAL_EACH'], 308 | extra_vars=extra_vars, 309 | reload_epoch=params['RELOAD'], 310 | is_text=True, 311 | input_text_id=input_text_id, 312 | index2word_y=vocab, 313 | index2word_x=vocab_src, 314 | sampling_type=params['SAMPLING'], 315 | beam_search=params['BEAM_SEARCH'], 316 | save_path=model.model_path, 317 | start_eval_on_epoch=params['START_EVAL_ON_EPOCH'], 318 | write_samples=True, 319 | write_type=params['SAMPLING_SAVE_MODE'], 320 | eval_on_epochs=params['EVAL_EACH_EPOCHS'], 321 | save_each_evaluation=params['SAVE_EACH_EVALUATION'], 322 | verbose=params['VERBOSE']) 323 | else: 324 | callback_metric = EvalPerformance(model, 325 | dataset, 326 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 327 | metric_name=params['METRICS'], 328 | set_name=params['EVAL_ON_SETS'], 329 | batch_size=params['BATCH_SIZE'], 330 | each_n_epochs=params['EVAL_EACH'], 331 | extra_vars=extra_vars, 332 | reload_epoch=params['RELOAD'], 333 | save_path=model.model_path, 334 | start_eval_on_epoch=params[ 335 | 'START_EVAL_ON_EPOCH'], 336 | write_samples=True, 337 | write_type=params['SAMPLING_SAVE_MODE'], 338 | eval_on_epochs=params['EVAL_EACH_EPOCHS'], 339 | save_each_evaluation=params[ 340 | 'SAVE_EACH_EVALUATION'], 341 | verbose=params['VERBOSE']) 342 | 343 | callbacks.append(callback_metric) 344 | 345 | if params['SAMPLE_ON_SETS']: 346 | # Write some samples 347 | extra_vars = {'language': params.get('TRG_LAN', 'en'), 'n_parallel_loaders': params['PARALLEL_LOADERS']} 348 | if not '-vidtext-embed' in params['DATASET_NAME']: 349 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 350 | else: 351 | vocab = None 352 | if params['BEAM_SEARCH']: 353 | extra_vars['beam_size'] = params['BEAM_SIZE'] 354 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1) 355 | extra_vars['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] 356 | extra_vars['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params['DATASET_NAME'] 357 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] 358 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 359 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 360 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 361 | extra_vars['normalize_probs'] = params['NORMALIZE_SAMPLING'] 362 | extra_vars['alpha_factor'] = params['ALPHA_FACTOR'] 363 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[ 364 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME'] 365 | 366 | callback_sampling = Sample(model, 367 | dataset, 368 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 369 | set_name=params['SAMPLE_ON_SETS'], 370 | n_samples=params['N_SAMPLES'], 371 | each_n_updates=params['SAMPLE_EACH_UPDATES'], 372 | extra_vars=extra_vars, 373 | reload_epoch=params['RELOAD'], 374 | batch_size=params['BATCH_SIZE'], 375 | is_text=True, 376 | index2word_y=vocab, # text info 377 | in_pred_idx=params['INPUTS_IDS_DATASET'][0], 378 | sampling_type=params['SAMPLING'], # text info 379 | 
beam_search=params['BEAM_SEARCH'], 380 | start_sampling_on_epoch=params['START_SAMPLING_ON_EPOCH'], 381 | verbose=params['VERBOSE']) 382 | callbacks.append(callback_sampling) 383 | 384 | return callbacks 385 | 386 | 387 | def check_params(params): 388 | if 'Glove' in params['MODEL_TYPE'] and params['GLOVE_VECTORS'] is None: 389 | logger.warning("You set a model that uses pretrained word vectors but you didn't specify a vector file." 390 | "We'll train WITHOUT pretrained embeddings!") 391 | if params["USE_DROPOUT"] and params["USE_BATCH_NORMALIZATION"]: 392 | logger.warning("It's not recommended to use both dropout and batch normalization") 393 | 394 | 395 | if __name__ == "__main__": 396 | 397 | parameters = load_parameters() 398 | try: 399 | for arg in sys.argv[1:]: 400 | k, v = arg.split('=') 401 | parameters[k] = ast.literal_eval(v) 402 | except ValueError: 403 | print 'Overwritten arguments must have the form key=Value' 404 | exit(1) 405 | check_params(parameters) 406 | if parameters['MODE'] == 'training' or parameters['MODE'] == 'finetuning': 407 | logging.info('Running training.') 408 | train_model(parameters) 409 | elif parameters['MODE'] == 'sampling': 410 | logging.info('Running sampling.') 411 | apply_Video_model(parameters) 412 | 413 | logging.info('Done!') 414 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/README.md: -------------------------------------------------------------------------------- 1 | Package for performing hyperparameter optimization with [Spearmint] (https://github.com/HIPS/Spearmint). 2 | 3 | Requirements: Those specified in the [Spearmint] (https://github.com/HIPS/Spearmint) package: 4 | 5 | * [NumPy](http://www.numpy.org/) 6 | * [scikit learn](http://scikit-learn.org/stable/index.html) 7 | * [pymongo](https://api.mongodb.org/python/current) 8 | * [MongoDB](https://www.mongodb.org) 9 | 10 | Installation: 11 | 12 | * Install [Spearmint] (https://github.com/HIPS/Spearmint/blob/master/README.md) 13 | 14 | Usage: 15 | 16 | 1) Set your experimental settings (see `${nmt_keras_path}/spearmint/config.json` for an example) 17 | 18 | * **_WARNING!_**: It is highly recommendable to specify an absolute path to the data files in `config.py` when launching spearmint! 19 | 20 | 2) Run the `launch_spearmint.sh` script. It will execute the following steps: 21 | 22 | * Get NMT-Keras directory: 23 | 24 | ```bash 25 | cd nmt-keras 26 | nmt_keras_path=`pwd` 27 | ``` 28 | 29 | * Create directory for storing the database: 30 | 31 | ```bash 32 | mkdir ${nmt_keras_path}/spearmint/db 33 | ``` 34 | 35 | * Start the Mongo database: 36 | 37 | ```bash 38 | mongod --fork --logpath ${nmt_keras_path}/spearmint/db/log --dbpath ${nmt_keras_path}/spearmint/db 39 | ``` 40 | 41 | * Remove eventual instances of previous experiments 42 | 43 | ```bash 44 | ${spearmint_path}/spearmint/cleanup.sh ${nmt_keras_path}/spearmint/ 45 | ``` 46 | 47 | * Lauch Spearmint! 
Assuming that it is installed under `${spearmint_path}`: 48 | 49 | ```bash 50 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err & 51 | ``` 52 | 53 | * The results will appear at `${nmt_keras_path}/spearmint/output` 54 | 55 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/meta-optimizers/spearmint/__init__.py -------------------------------------------------------------------------------- /meta-optimizers/spearmint/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "PYTHON", 3 | "main-file": "spearmint_opt.py", 4 | "experiment-name": "TemporallyLinkedVideoDescriptionAtt", 5 | "likelihood": "GAUSSIAN", 6 | "variables": { 7 | "TARGET_TEXT_EMBEDDING_SIZE": { 8 | "type": "INT", 9 | "size": 1, 10 | "min": 50, 11 | "max": 600 12 | }, 13 | "ENCODER_HIDDEN_SIZE": { 14 | "type": "INT", 15 | "size": 1, 16 | "min": 100, 17 | "max": 500 18 | }, 19 | "LR_GAMMA": { 20 | "type": "FLOAT", 21 | "size": 1, 22 | "min": 0.95, 23 | "max": 1.0 24 | }, 25 | "N_LAYERS_ENCODER": { 26 | "type": "INT", 27 | "size": 1, 28 | "min": 1, 29 | "max": 2 30 | }, 31 | "N_LAYERS_PREV_SENT_ENCODER": { 32 | "type": "INT", 33 | "size": 1, 34 | "min": 1, 35 | "max": 2 36 | }, 37 | "DECODER_HIDDEN_SIZE": { 38 | "type": "INT", 39 | "size": 1, 40 | "min": 100, 41 | "max": 600 42 | }, 43 | "PREV_SENT_ENCODER_HIDDEN_SIZE": { 44 | "type": "INT", 45 | "size": 1, 46 | "min": 100, 47 | "max": 500 48 | } 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/launch_spearmint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spearmint_path=${SOFTWARE_PREFIX}/Spearmint 4 | nmt_keras_path=${SOFTWARE_PREFIX}/egocentric-video-description 5 | dest_dir=${nmt_keras_path}/meta-optimizers/spearmint 6 | mkdir -p ${dest_dir}/db 7 | mkdir -p ${dest_dir}/logs 8 | 9 | #Launch mongodb if it is not already launched 10 | if [ `ps -wuax |grep mongod |wc -l` -lt 2 ]; then 11 | mongod --fork --logpath ${dest_dir}/db/log --dbpath ${dest_dir}/db; 12 | fi 13 | 14 | 15 | ${spearmint_path}/spearmint/cleanup.sh ${dest_dir} 16 | 17 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err & 18 | echo "Main Spearmint process PID:" $! >> ${dest_dir}/logs/out.log -------------------------------------------------------------------------------- /meta-optimizers/spearmint/spearmint_opt.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | # sys.path.append("../../") # Adds higher directory to python modules path. 
7 | sys.path.insert(1, os.path.abspath(".")) 8 | sys.path.insert(0, os.path.abspath("../../")) 9 | 10 | print sys.path 11 | 12 | from config import load_parameters 13 | from main import check_params, train_model 14 | 15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 16 | logger = logging.getLogger(__name__) 17 | metric_name = 'Bleu_4' 18 | maximize = True # Select whether we want to maximize the metric or minimize it 19 | d = dict(os.environ.copy()) 20 | d['LC_NUMERIC'] = 'en_US.utf-8' 21 | 22 | 23 | def invoke_model(parameters): 24 | model_params = load_parameters() 25 | model_name = model_params["MODEL_TYPE"] 26 | for parameter in parameters.keys(): 27 | model_params[parameter] = parameters[parameter][0] 28 | logger.debug("Assigning to %s the value %s" % (str(parameter), parameters[parameter][0])) 29 | model_name += '_' + str(parameter) + '_' + str(parameters[parameter][0]) 30 | model_params["SKIP_VECTORS_HIDDEN_SIZE"] = model_params["TARGET_TEXT_EMBEDDING_SIZE"] 31 | model_params["MODEL_NAME"] = model_name 32 | # models and evaluation results will be stored here 33 | model_params[ 34 | "STORE_PATH"] = '/home/lvapeab/smt/software/egocentric-video-description/meta-optimizers/spearmint/trained_models/' + \ 35 | model_params["MODEL_NAME"] + '/' 36 | check_params(model_params) 37 | assert model_params['MODE'] == 'training', 'You can only launch Spearmint when training!' 38 | logging.info('Running training.') 39 | train_model(model_params) 40 | 41 | results_path = model_params['STORE_PATH'] + '/' + model_params['EVAL_ON_SETS'][0] + '.' + model_params['METRICS'][0] 42 | 43 | # Recover the highest metric score 44 | metric_pos_cmd = "head -n 1 " + results_path + \ 45 | " |awk -v metric=" + metric_name + \ 46 | " 'BEGIN{FS=\",\"}" \ 47 | "{for (i=1; i<=NF; i++) if ($i == metric) print i;}'" 48 | metric_pos = \ 49 | subprocess.Popen(metric_pos_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate()[0][:-1] 50 | cmd = "tail -n +2 " + results_path + \ 51 | " |awk -v m_pos=" + str(metric_pos) + \ 52 | " 'BEGIN{FS=\",\"}{print $m_pos}'|sort -gr|head -n 1" 53 | ps = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, env=d) 54 | metric_value = float(ps.communicate()[0]) 55 | print "Best %s: %f" % (metric_name, metric_value) 56 | 57 | return 1. 
- metric_value if maximize else metric_value # Spearmint minimizes a function 58 | 59 | 60 | def main(job_id, params): 61 | print params 62 | return invoke_model(params) 63 | 64 | 65 | if __name__ == "__main__": 66 | # Testing function 67 | params = {'SOURCE_TEXT_EMBEDDING_SIZE': [1], 68 | 'ENCODER_HIDDEN_SIZE': [2], 69 | 'TARGET_TEXT_EMBEDDING_SIZE': [1], 70 | 'DECODER_HIDDEN_SIZE': [2], 71 | 'MAX_EPOCH': [2], 72 | 'START_EVAL_ON_EPOCH': [1]} 73 | main(1, params) 74 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH=$PYTHONPATH:/media/HDD_2TB/marc/multimodal_keras_wrapper python -u main.py 2 | -------------------------------------------------------------------------------- /turing_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import sys 4 | 5 | import numpy as np 6 | 7 | from config import load_parameters 8 | from data_engine.prepare_data import build_dataset 9 | from viddesc_model import VideoDesc_Model 10 | 11 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def build(params): 16 | ds = build_dataset(params) 17 | params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 18 | vocab = ds.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 19 | 20 | # We only want the model for decoding 21 | video_model = VideoDesc_Model(params, 22 | type=params['MODEL_TYPE'], 23 | verbose=0, 24 | model_name=params['MODEL_NAME'], 25 | vocabularies=ds.vocabulary, 26 | store_path=params['STORE_PATH'], 27 | set_optimizer=False) 28 | 29 | return ds, vocab, video_model 30 | 31 | 32 | def sample(ds, vocab, video_model, n_samples, split='train', verbose=1): 33 | truth_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples) 34 | 35 | matches = 0 36 | misses = 0 37 | guesses = 0 38 | 39 | [truth_X, truth_Y] = ds.getXY_FromIndices('train', truth_data) 40 | 41 | truth_Xs = video_model.decode_predictions_beam_search(np.asarray(truth_X[-2]), vocab, verbose=0, pad_sequences=True) 42 | truth_Ys = video_model.decode_predictions_one_hot(np.asarray(truth_Y[0][0]), vocab) 43 | 44 | for i, (truth_X, truth_Y) in enumerate(zip(truth_Xs, truth_Ys)): 45 | try: 46 | fake_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples) 47 | [fake_X, fake_Y] = ds.getXY_FromIndices('train', fake_data) 48 | fake_Xs = video_model.decode_predictions_beam_search(np.asarray(fake_X[-2]), vocab, verbose=0, 49 | pad_sequences=True) 50 | fake_Ys = video_model.decode_predictions_one_hot(np.asarray(fake_Y[0][0]), vocab) 51 | 52 | print "Input", i, ":", truth_X 53 | print "Which is the following event?" 54 | 55 | answer_list = [truth_Y] + fake_Ys 56 | correctness_list = [True] + [False] * len(fake_Ys) 57 | answer_correctness_list = list(zip(answer_list, correctness_list)) 58 | random.shuffle(answer_correctness_list) 59 | shuffled_answer_list, shuffled_correctness_list = zip(*answer_correctness_list) 60 | for j, answer in enumerate(shuffled_answer_list): 61 | print "\t", j, ":", answer 62 | action = int(raw_input('Select the upcoming event. \n')) 63 | if shuffled_correctness_list[action]: 64 | matches += 1 65 | if verbose: 66 | print "Correct!" 67 | else: 68 | misses += 1 69 | if verbose: 70 | print "Not correct!. 
The correct one was:", shuffled_answer_list[ 71 | shuffled_correctness_list.index(True)] 72 | guesses += 1 73 | print "" 74 | print "" 75 | except KeyboardInterrupt: 76 | return matches, misses, guesses 77 | 78 | return matches, misses, guesses 79 | 80 | 81 | if __name__ == "__main__": 82 | 83 | parameters = load_parameters() 84 | ########### 85 | ds, vocab, model = build(parameters) 86 | total_matches = 0 87 | total_misses = 0 88 | total_guesses = 0 89 | while True: 90 | try: 91 | matches, misses, guesses = sample(ds, vocab, model, 4, split='train', verbose=0) 92 | total_matches += matches 93 | total_misses += misses 94 | total_guesses += guesses 95 | except KeyboardInterrupt: 96 | print "Interrupted!" 97 | print "Total number of matches: %d/%d" % (total_matches, total_guesses) 98 | print "Total number of misses: %d/%d" % (total_misses, total_guesses) 99 | print "Precision: %f" % (float(total_matches) / total_guesses) 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/utils/__init__.py -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import json 4 | import os 5 | import re 6 | 7 | from toolz import itemmap 8 | 9 | from keras.optimizers import Adadelta 10 | from keras.optimizers import Adagrad 11 | from keras.optimizers import Adam 12 | from keras.optimizers import RMSprop 13 | from keras.optimizers import SGD 14 | 15 | PADDING = '' 16 | UNKNOWN = 'UNK' 17 | EOA = '' # end of answer 18 | EOQ = '' # end of question 19 | EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ] 20 | EXTRA_WORDS = {PADDING: 0, UNKNOWN: 1, EOA: 2, EOQ: 3} 21 | EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS) 22 | MAXLEN = 50 23 | 24 | OPTIMIZERS = { \ 25 | 'sgd': SGD, 26 | 'adagrad': Adagrad, 27 | 'adadelta': Adadelta, 28 | 'rmsprop': RMSprop, 29 | 'adam': Adam, 30 | } 31 | 32 | 33 | ### 34 | # Functions 35 | ### 36 | def static_vars(**kwargs): 37 | def decorate(func): 38 | for k in kwargs: 39 | setattr(func, k, kwargs[k]) 40 | return func 41 | 42 | return decorate 43 | 44 | 45 | @static_vars(counter=len(EXTRA_WORDS)) 46 | def _myinc(d): 47 | """ 48 | Gets a tuple d, and returns d[0]: id. 49 | """ 50 | x = d[0] 51 | _myinc.counter += 1 52 | return (x, _myinc.counter - 1) 53 | 54 | 55 | def create_dir_if_not_exists(directory): 56 | if not os.path.exists(directory): 57 | print 'creating directory %s' % directory 58 | os.makedirs(directory) 59 | else: 60 | print "%s already exists!" 
% directory 61 | 62 | 63 | def preprocess_line(line): 64 | cap_tmp = line.strip().decode('utf-8').lower().encode('utf8') 65 | return cap_tmp 66 | 67 | 68 | def preprocess_caption(cap): 69 | commaStrip = re.compile("(\d)(\,)(\d)") 70 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!'] 71 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)") 72 | 73 | def processPunctuation(inText): 74 | outText = inText 75 | for p in punct: 76 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None): 77 | outText = outText.replace(p, '') 78 | else: 79 | outText = outText.replace(p, ' ') 80 | outText = periodStrip.sub("", outText, re.UNICODE) 81 | return outText 82 | 83 | cap_tmp = cap.strip().decode('utf-8').lower().encode('utf8') 84 | cap_tmp = processPunctuation(cap_tmp) 85 | return cap_tmp 86 | 87 | 88 | def preprocess_question(q): 89 | contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", 90 | "couldn'tve": "couldn’t’ve", "couldnt’ve": "couldn’t’ve", "didnt": "didn’t", "doesnt": "doesn’t", 91 | "dont": "don’t", "hadnt": "hadn’t", "hadnt’ve": "hadn’t’ve", "hadn'tve": "hadn’t’ve", 92 | "hasnt": "hasn’t", "havent": "haven’t", "hed": "he’d", "hed’ve": "he’d’ve", "he’dve": "he’d’ve", 93 | "hes": "he’s", "howd": "how’d", "howll": "how’ll", "hows": "how’s", "Id’ve": "I’d’ve", 94 | "I’dve": "I’d’ve", "Im": "I’m", "Ive": "I’ve", "isnt": "isn’t", "itd": "it’d", "itd’ve": "it’d’ve", 95 | "it’dve": "it’d’ve", "itll": "it’ll", "let’s": "let’s", "maam": "ma’am", "mightnt": "mightn’t", 96 | "mightnt’ve": "mightn’t’ve", "mightn’tve": "mightn’t’ve", "mightve": "might’ve", 97 | "mustnt": "mustn’t", 98 | "mustve": "must’ve", "neednt": "needn’t", "notve": "not’ve", "oclock": "o’clock", 99 | "oughtnt": "oughtn’t", 100 | "ow’s’at": "’ow’s’at", "’ows’at": "’ow’s’at", "’ow’sat": "’ow’s’at", "shant": "shan’t", 101 | "shed’ve": "she’d’ve", "she’dve": "she’d’ve", "she’s": "she’s", "shouldve": "should’ve", 102 | "shouldnt": "shouldn’t", "shouldnt’ve": "shouldn’t’ve", "shouldn’tve": "shouldn’t’ve", 103 | "somebody’d": "somebodyd", "somebodyd’ve": "somebody’d’ve", "somebody’dve": "somebody’d’ve", 104 | "somebodyll": "somebody’ll", "somebodys": "somebody’s", "someoned": "someone’d", 105 | "someoned’ve": "someone’d’ve", "someone’dve": "someone’d’ve", "someonell": "someone’ll", 106 | "someones": "someone’s", "somethingd": "something’d", "somethingd’ve": "something’d’ve", 107 | "something’dve": "something’d’ve", "somethingll": "something’ll", "thats": "that’s", 108 | "thered": "there’d", "thered’ve": "there’d’ve", "there’dve": "there’d’ve", "therere": "there’re", 109 | "theres": "there’s", "theyd": "they’d", "theyd’ve": "they’d’ve", "they’dve": "they’d’ve", 110 | "theyll": "they’ll", "theyre": "they’re", "theyve": "they’ve", "twas": "’twas", "wasnt": "wasn’t", 111 | "wed’ve": "we’d’ve", "we’dve": "we’d’ve", "weve": "we've", "werent": "weren’t", "whatll": "what’ll", 112 | "whatre": "what’re", "whats": "what’s", "whatve": "what’ve", "whens": "when’s", "whered": 113 | "where’d", "wheres": "where's", "whereve": "where’ve", "whod": "who’d", "whod’ve": "who’d’ve", 114 | "who’dve": "who’d’ve", "wholl": "who’ll", "whos": "who’s", "whove": "who've", "whyll": "why’ll", 115 | "whyre": "why’re", "whys": "why’s", "wont": "won’t", "wouldve": "would’ve", "wouldnt": "wouldn’t", 116 | "wouldnt’ve": "wouldn’t’ve", "wouldn’tve": "wouldn’t’ve", "yall": "y’all", "yall’ll": "y’all’ll", 117 | "y’allll": 
"y’all’ll", "yall’d’ve": "y’all’d’ve", "y’alld’ve": "y’all’d’ve", 118 | "y’all’dve": "y’all’d’ve", 119 | "youd": "you’d", "youd’ve": "you’d’ve", "you’dve": "you’d’ve", "youll": "you’ll", 120 | "youre": "you’re", "youve": "you’ve"} 121 | manualMap = {'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 122 | 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'} 123 | articles = ['a', 'an', 'the'] 124 | commaStrip = re.compile("(\d)(\,)(\d)") 125 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!'] 126 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)") 127 | 128 | def processPunctuation(inText): 129 | outText = inText 130 | for p in punct: 131 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None): 132 | outText = outText.replace(p, '') 133 | else: 134 | outText = outText.replace(p, ' ') 135 | outText = periodStrip.sub("", outText, re.UNICODE) 136 | return outText 137 | 138 | def processDigitArticle(inText): 139 | outText = [] 140 | tempText = inText.lower().split() 141 | for word in tempText: 142 | word = manualMap.setdefault(word, word) 143 | if word not in articles: 144 | outText.append(word) 145 | else: 146 | pass 147 | for wordId, word in enumerate(outText): 148 | if word in contractions: 149 | outText[wordId] = contractions[word] 150 | outText = ' '.join(outText) 151 | return outText 152 | 153 | q_tmp = q.strip().lower().encode('utf8') 154 | # q_tmp = processPunctuation(q_tmp) 155 | # q_tmp = processDigitArticle(q_tmp) 156 | if q_tmp[-1] == '?' and q_tmp[-2] != ' ': 157 | # separate word token from the question mark 158 | q_tmp = q_tmp[:-1] + ' ?' 159 | # remove question mark 160 | if q_tmp[-1] == '?': q_tmp = q_tmp[:-1] 161 | return q_tmp 162 | 163 | 164 | def save_txt_answers(samples, savefile='./sample', whichset='val', step=''): 165 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f: 166 | print >> f, '\n'.join(samples) 167 | 168 | 169 | def save_json_answers(samples, savefile='./sample', whichset='val', step=''): 170 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f: 171 | json.dump(samples, f) 172 | 173 | 174 | def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS, 175 | is_reset=True, truncate_to_most_frequent=0): 176 | """ 177 | Builds vocabulary from wordcount. 178 | It also adds extra words to the vocabulary. 179 | 180 | In: 181 | this_wordcount - dictionary of wordcounts, e.g. 
{'cpu':3} 182 | extra_words - additional words to build the vocabulary 183 | dictionary of {word: id} 184 | by default {UNKNOWN: 0} 185 | is_reset - if True we restart the vocabulary counting 186 | by defaults False 187 | truncate_to_most_frequent - if positive then the vocabulary 188 | is truncated to 'truncate_to_most_frequent' words; 189 | by default 0 190 | Out: 191 | word2index - mapping from words to indices 192 | index2word - mapping from indices to words 193 | """ 194 | if is_reset: 195 | _myinc.counter = len(EXTRA_WORDS) 196 | if truncate_to_most_frequent > 0: 197 | sorted_wordcount = dict(sorted( 198 | this_wordcount.items(), key=lambda x: x[1], reverse=True)[:truncate_to_most_frequent]) 199 | this_wordcount = sorted_wordcount 200 | word2index = itemmap(_myinc, this_wordcount) 201 | if not extra_words == {}: 202 | assert (all([el not in word2index.values() for el in extra_words.values()])) 203 | word2index.update(extra_words) 204 | index2word = itemmap(reversed, word2index) 205 | return word2index, index2word 206 | 207 | 208 | def index_sequence(x, word2index): 209 | """ 210 | Converts list of words into a list of its indices wrt. word2index, that is into 211 | index encoded sequence. 212 | 213 | In: 214 | x - list of lines 215 | word2index - mapping from words to indices 216 | 217 | Out: 218 | a list of the list of indices that encode the words 219 | """ 220 | one_hot_x = [] 221 | for line in x: 222 | line_list = [] 223 | for w in line.split(): 224 | w = w.strip() 225 | if w in word2index: 226 | this_ind = word2index[w] 227 | else: 228 | this_ind = word2index[UNKNOWN] 229 | line_list.append(this_ind) 230 | one_hot_x.append(line_list) 231 | return one_hot_x 232 | -------------------------------------------------------------------------------- /utils/evaluate_from_file.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scores a file of hypothesis. 3 | Usage: 4 | 1. Set the references in this file (questions and annotations). 5 | 2. 
python evaluate_vqa.py hypothesis.json 6 | """ 7 | 8 | import argparse 9 | 10 | from pycocoevalcap.bleu.bleu import Bleu 11 | from pycocoevalcap.cider.cider import Cider 12 | from pycocoevalcap.meteor.meteor import Meteor 13 | from pycocoevalcap.rouge.rouge import Rouge 14 | from pycocoevalcap.vqa import vqaEval, visual_qa 15 | 16 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/' 17 | ROOT_PATH = '/media/HDD_2TB/DATASETS/' 18 | 19 | questions = ROOT_PATH + '/VQA/Questions/OpenEnded_mscoco_val2014_questions.json' 20 | annotations = ROOT_PATH + '/VQA/Annotations/mscoco_val2014_annotations.json' 21 | 22 | parser = argparse.ArgumentParser( 23 | description="""This takes two files and a path the references (source, references), 24 | computes bleu, meteor, rouge and cider metrics""", formatter_class=argparse.RawTextHelpFormatter) 25 | parser.add_argument('-vqa', default=False, action="store_true", help='Compute VQA metrics') 26 | 27 | parser.add_argument('-q', type=str, default=questions, help='Path to questions file (only if the -vqa flag is active)') 28 | parser.add_argument('-a', type=str, default=annotations, 29 | help='Path to annotations file (only if the -vqa flag is active)') 30 | parser.add_argument('-hyp', type=str, help='Hypotheses file') 31 | 32 | parser.add_argument('-l', type=str, default='en', help='Meteor language') 33 | parser.add_argument('-r', type=argparse.FileType('r'), nargs="+", 34 | help='Path to all the reference files (single-reference files)') 35 | 36 | 37 | def score_vqa(resFile, quesFile, annFile): 38 | # create vqa object and vqaRes object 39 | vqa_ = visual_qa.VQA(annFile, quesFile) 40 | vqaRes = vqa_.loadRes(resFile, quesFile) 41 | vqaEval_ = vqaEval.VQAEval(vqa_, vqaRes, 42 | n=2) # n is precision of accuracy (number of places after decimal), default is 2 43 | vqaEval_.evaluate() 44 | print "Overall Accuracy is: %.02f\n" % (vqaEval_.accuracy['overall']) 45 | return vqaEval_.accuracy['overall'] 46 | 47 | 48 | def load_textfiles(references, hypothesis): 49 | print "The number of references is {}".format(len(references)) 50 | hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesis)} 51 | # take out newlines before creating dictionary 52 | raw_refs = [map(str.strip, r) for r in zip(*references)] 53 | refs = {idx: rr for idx, rr in enumerate(raw_refs)} 54 | # sanity check that we have the same number of references as hypothesis 55 | if len(hypo) != len(refs): 56 | raise ValueError("There is a sentence number mismatch between the inputs: \n" 57 | "\t # sentences in references: %d\n" 58 | "\t # sentences in hypothesis: %d" % (len(refs), len(hypo))) 59 | return refs, hypo 60 | 61 | 62 | def CocoScore(ref, hypo, language='en'): 63 | """ 64 | ref, dictionary of reference sentences (id, sentence) 65 | hypo, dictionary of hypothesis sentences (id, sentence) 66 | score, dictionary of scores 67 | """ 68 | scorers = [ 69 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 70 | (Meteor(language), "METEOR"), 71 | (Rouge(), "ROUGE_L"), 72 | (Cider(), "CIDEr") 73 | ] 74 | final_scores = {} 75 | for scorer, method in scorers: 76 | score, scores = scorer.compute_score(ref, hypo) 77 | if type(score) == list: 78 | for m, s in zip(method, score): 79 | final_scores[m] = s 80 | else: 81 | final_scores[method] = score 82 | return final_scores 83 | 84 | 85 | if __name__ == "__main__": 86 | 87 | args = parser.parse_args() 88 | vqa_evaluation = args.vqa 89 | if vqa_evaluation: 90 | questions = args.q 91 | annotations = args.a 92 | hypotheses = args.hyp 93 | print 
"hypotheses file:", hypotheses 94 | score = score_vqa(hypotheses, questions, annotations) 95 | print "Score: ", score 96 | else: 97 | language = args.l 98 | hypotheses = open(args.hyp, 'r') 99 | ref, hypo = load_textfiles(args.r, hypotheses) 100 | score = CocoScore(ref, hypo, language=language) 101 | print "Score: ", score 102 | -------------------------------------------------------------------------------- /utils/plot_metric.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Read and plot several logs from cococaption 4 | 5 | if [ $# -lt 1 ]; 6 | then 7 | echo "Usage $0 [train.log] [val.log] [test.log]" 8 | fi 9 | 10 | metric_pos="3" 11 | metric_name="Bleu_4" 12 | out_name="./${metric_name}_plot" 13 | tail -n +2 $1 | awk 'BEGIN{FS=","}{print 1}'>/tmp/epochs; 14 | 15 | i=1 16 | for result in "$@"; do 17 | basename=$(basename $result) 18 | tail -n +2 $result | awk -v pos=${metric_pos} 'BEGIN{FS=","}{print $pos}'>/tmp/${basename}; 19 | names[$i]="${basename%.*}" 20 | i=$(( i + 1 )) 21 | basenames=${basenames}" /tmp/`basename $result`" 22 | done 23 | echo "Epoch ${names[*]}" > /tmp/scores 24 | 25 | paste -d " " /tmp/epochs $basenames >> /tmp/scores 26 | 27 | echo "set encoding iso_8859_1 28 | 29 | set style data lines 30 | set key font ',20' height 2 31 | set xtics font ',18' 32 | set ytics font ',18' 33 | set xlabel font ',20' '# Epoch' 34 | set ylabel font ',20' '${metric_name}'; 35 | 36 | set title '' 37 | set terminal pdf enhanced 38 | set termoption dash 39 | set output '${out_name}.pdf' 40 | set key left 41 | 42 | set yrange[0:1] 43 | set ytics (0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0) 44 | 45 | set bmargin 4 46 | plot for [col=2:$(( $# + 1 ))] '/tmp/scores' using 0:col with lines lt col lw 5 title columnheader " | gnuplot 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /utils/prepare_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from common import create_dir_if_not_exists 4 | 5 | ###### Parameters 6 | 7 | ROOT_PATH = '/media/HDD_2TB/DATASETS/' 8 | 9 | base_path = ROOT_PATH + '/Flickr8k/Features/' 10 | features = 'KCNN' # KCNN, Scenes, Objects 11 | base_path_save = base_path + features 12 | 13 | feats_paths = ['train_' + features + '_features.csv', 14 | 'val_' + features + '_features.csv', 15 | 'test_' + features + '_features.csv'] 16 | 17 | names_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt'] 18 | folders_save = ['train', 'val', 'test'] 19 | 20 | apply_L2 = False 21 | n_feats = 1024 22 | 23 | ############ 24 | 25 | if apply_L2: 26 | file_save = features + '_L2' 27 | else: 28 | file_save = features 29 | 30 | 31 | def csv2npy(): 32 | # Process each data split separately 33 | for n, f, fs in zip(names_lists, feats_paths, folders_save): 34 | print "Preparing features %s" % f 35 | feats_dict = dict() 36 | # Get file names 37 | names = [] 38 | with open(base_path + '/' + n, 'r') as file: 39 | for line in file: 40 | line = line.rstrip('\n') 41 | line = line.split('.')[0] 42 | names.append(line) 43 | # Get features 44 | with open(base_path + '/' + f, 'r') as file: 45 | for i, line in enumerate(file): 46 | feats = np.fromstring(line.rstrip('\n'), sep=',') 47 | if (apply_L2): 48 | feats = feats / np.linalg.norm(feats, ord=2) 49 | # Insert in dictionary 50 | feats_dict[names[i]] = feats[:n_feats] 51 | 52 | # Store dict 53 | print "Saving features in %s" % 
(base_path_save + '/' + fs + '/' + file_save + '.npy') 54 | create_dir_if_not_exists(base_path_save + '/' + fs) 55 | np.save(base_path_save + '/' + fs + '/' + file_save + '.npy', feats_dict) 56 | print 57 | 58 | 59 | if __name__ == "__main__": 60 | csv2npy() 61 | -------------------------------------------------------------------------------- /utils/pretrain_word_vectors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Parameters 4 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/VQA/' 5 | ROOT_PATH = '/media/HDD_2TB/DATASETS/VQA/' 6 | base_path = ROOT_PATH + 'Glove/' 7 | glove_path = base_path + 'glove.42B.300d.txt' 8 | dest_file = 'glove_300' 9 | 10 | 11 | def glove2npy(glove_path, base_path_save, dest_file): 12 | vecs_dict = dict() 13 | print "Loading vectors from %s" % (glove_path) 14 | 15 | glove_vectors = [x[:-1] for x in open(glove_path).readlines()] 16 | n_vecs = len(glove_vectors) 17 | print "Found %d vectors in %s" % (n_vecs, glove_path) 18 | i = 0 19 | for vector in glove_vectors: 20 | v = vector.split() 21 | word = v[0] 22 | vec = np.asarray(v[1:], dtype='float32') 23 | vecs_dict[word] = vec 24 | i += 1 25 | if i % 1000 == 0: 26 | print "Processed", i, "vectors (", 100 * float(i) / n_vecs, "%)\r", 27 | print 28 | # Store dict 29 | print "Saving word vectors in %s" % (base_path_save + '/' + dest_file + '.npy') 30 | # create_dir_if_not_exists(base_path_save) 31 | np.save(base_path_save + '/' + dest_file + '.npy', vecs_dict) 32 | print 33 | 34 | 35 | if __name__ == "__main__": 36 | glove2npy(glove_path, base_path, dest_file) 37 | -------------------------------------------------------------------------------- /utils/sort_by_split.py: -------------------------------------------------------------------------------- 1 | # Retrieves the images of a given split and sorts them according to that split 2 | import shutil 3 | 4 | from common import create_dir_if_not_exists 5 | 6 | image_dir = '/data/DATASETS/Flickr8k/Images' 7 | annotatios_dir = '/data/DATASETS/Flickr8k/Annotations' 8 | split_name = 'val' 9 | dest_dir = image_dir + '/' + split_name + '_images' 10 | ext = '.jpg' 11 | 12 | with open(annotatios_dir + '/' + split_name + '_list_ids.txt') as f: 13 | lines = f.readlines() 14 | 15 | create_dir_if_not_exists(dest_dir) 16 | n_items = len(str(len(lines))) + 1 17 | i = 0 18 | for filename in lines: 19 | i += 1 20 | shutil.copyfile(image_dir + '/' + filename[:-1] + ext, dest_dir + '/' + str(i).zfill(n_items) + ext) 21 | -------------------------------------------------------------------------------- /utils/split_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def iter_loadtxt(filename, delimiter=',', skiprows=0, dtype=np.float32): 5 | def iter_func(): 6 | with open(filename, 'r') as infile: 7 | for _ in range(skiprows): 8 | next(infile) 9 | for line in infile: 10 | line = line.rstrip().split(delimiter) 11 | for item in line: 12 | yield dtype(item) 13 | iter_loadtxt.rowlength = len(line) 14 | 15 | data = np.fromiter(iter_func(), dtype=dtype) 16 | data = data.reshape((-1, iter_loadtxt.rowlength)) 17 | return data 18 | 19 | 20 | base_path = '/media/HDD_2TB/DATASETS/MSVD/Features/' 21 | feature = 'ImageNetFV_Places_C3Dfc8' 22 | out_feature = 'ImageNetFV' 23 | 24 | for split in ['train', 'val', 'test']: 25 | print "Loading %s features" % str(split + '_' + feature) 26 | # feats = np.genfromtxt(open(base_path + split + '_' + feature + 
"_features.csv", "rb"), delimiter=",", dtype='float32') 27 | feats = iter_loadtxt(base_path + split + '_' + feature + "_features.csv") 28 | new_feats = feats[:, :1024] # Modify this instruction to get the desired features! 29 | print "Saving %s features" % str(split + '_' + feature) 30 | np.savetxt(base_path + split + '_' + out_feature + "_features.csv", new_feats, delimiter=",") 31 | -------------------------------------------------------------------------------- /utils/vocabulary_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ $# -lt 1 ] 5 | then 6 | echo "Usage: $0 text_file" 7 | echo "Computes the vocabulary size of text_file" 8 | exit 1 9 | fi 10 | 11 | 12 | for file in $* ;do 13 | vocab=`cat $file | tr " " '\n' | sort -u |wc -l` 14 | echo "$file: $vocab" 15 | done 16 | --------------------------------------------------------------------------------