├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── config.py
├── data_engine
│   ├── README.md
│   ├── __init__.py
│   ├── generate_corpus_full_history.py
│   ├── generate_descriptions_lists.py
│   ├── generate_features_lists.py
│   ├── generate_img_lists.py
│   ├── generate_img_lists_from_split.py
│   ├── generate_link_lists.py
│   ├── generate_parallel_corpus.py
│   ├── prepare_data.py
│   ├── split_data.py
│   └── subsample_frames_features.py
├── docs
│   └── model.png
├── main.py
├── meta-optimizers
│   └── spearmint
│       ├── README.md
│       ├── __init__.py
│       ├── config.json
│       ├── launch_spearmint.sh
│       └── spearmint_opt.py
├── train.sh
├── turing_test.py
├── utils
│   ├── __init__.py
│   ├── common.py
│   ├── evaluate_from_file.py
│   ├── plot_metric.sh
│   ├── prepare_features.py
│   ├── pretrain_word_vectors.py
│   ├── sort_by_split.py
│   ├── split_features.py
│   └── vocabulary_size.sh
├── viddesc_model.py
└── visualization.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 |
3 | .idea
4 |
5 | /meta-optimizers/spearmint/db/
6 | /meta-optimizers/spearmint/trained_models/
7 | /meta-optimizers/spearmint/output/
8 |
9 | ### Python template
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | env/
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *,cover
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 | ### Emacs template
69 | # -*- mode: gitignore; -*-
70 | *~
71 | \#*\#
72 | /.emacs.desktop
73 | /.emacs.desktop.lock
74 | *.elc
75 | auto-save-list
76 | tramp
77 | .\#*
78 |
79 | # Org-mode
80 | .org-id-locations
81 | *_archive
82 |
83 | # flymake-mode
84 | *_flymake.*
85 |
86 | # eshell files
87 | /eshell/history
88 | /eshell/lastdir
89 |
90 | # elpa packages
91 | /elpa/
92 |
93 | # reftex files
94 | *.rel
95 |
96 | # AUCTeX auto folder
97 | /auto/
98 |
99 | # cask packages
100 | .cask/
101 |
102 | # Models
103 | *.pkl
104 | *.json
105 | *.h5
106 | *.npy
107 | *.zip
108 |
109 | # Training results
110 | *.vqa
111 | *.coco
112 | *.multiclass
113 | *.pred
114 | *.txt
115 |
116 | # Visualization files
117 | .ipynb_checkpoints
118 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | COPYRIGHT
2 |
3 | Copyright (c) 2016, the respective contributors
4 | All rights reserved.
5 |
6 | ABiViRNet uses a shared copyright model: each contributor
7 | holds copyright over their contributions to ABiViRNet. The project versioning records
8 | all such contribution and copyright details. If a contributor wants to further
9 | mark their specific copyright on a particular contribution, they should
10 | indicate their copyright solely in the commit message of the change when it
11 | is committed.
12 |
13 |
14 | GNU GENERAL PUBLIC LICENSE
15 | Version 2, June 1991
16 |
17 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
18 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 | Everyone is permitted to copy and distribute verbatim copies
20 | of this license document, but changing it is not allowed.
21 |
22 | Preamble
23 |
24 | The licenses for most software are designed to take away your
25 | freedom to share and change it. By contrast, the GNU General Public
26 | License is intended to guarantee your freedom to share and change free
27 | software--to make sure the software is free for all its users. This
28 | General Public License applies to most of the Free Software
29 | Foundation's software and to any other program whose authors commit to
30 | using it. (Some other Free Software Foundation software is covered by
31 | the GNU Lesser General Public License instead.) You can apply it to
32 | your programs, too.
33 |
34 | When we speak of free software, we are referring to freedom, not
35 | price. Our General Public Licenses are designed to make sure that you
36 | have the freedom to distribute copies of free software (and charge for
37 | this service if you wish), that you receive source code or can get it
38 | if you want it, that you can change the software or use pieces of it
39 | in new free programs; and that you know you can do these things.
40 |
41 | To protect your rights, we need to make restrictions that forbid
42 | anyone to deny you these rights or to ask you to surrender the rights.
43 | These restrictions translate to certain responsibilities for you if you
44 | distribute copies of the software, or if you modify it.
45 |
46 | For example, if you distribute copies of such a program, whether
47 | gratis or for a fee, you must give the recipients all the rights that
48 | you have. You must make sure that they, too, receive or can get the
49 | source code. And you must show them these terms so they know their
50 | rights.
51 |
52 | We protect your rights with two steps: (1) copyright the software, and
53 | (2) offer you this license which gives you legal permission to copy,
54 | distribute and/or modify the software.
55 |
56 | Also, for each author's protection and ours, we want to make certain
57 | that everyone understands that there is no warranty for this free
58 | software. If the software is modified by someone else and passed on, we
59 | want its recipients to know that what they have is not the original, so
60 | that any problems introduced by others will not reflect on the original
61 | authors' reputations.
62 |
63 | Finally, any free program is threatened constantly by software
64 | patents. We wish to avoid the danger that redistributors of a free
65 | program will individually obtain patent licenses, in effect making the
66 | program proprietary. To prevent this, we have made it clear that any
67 | patent must be licensed for everyone's free use or not licensed at all.
68 |
69 | The precise terms and conditions for copying, distribution and
70 | modification follow.
71 |
72 | GNU GENERAL PUBLIC LICENSE
73 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
74 |
75 | 0. This License applies to any program or other work which contains
76 | a notice placed by the copyright holder saying it may be distributed
77 | under the terms of this General Public License. The "Program", below,
78 | refers to any such program or work, and a "work based on the Program"
79 | means either the Program or any derivative work under copyright law:
80 | that is to say, a work containing the Program or a portion of it,
81 | either verbatim or with modifications and/or translated into another
82 | language. (Hereinafter, translation is included without limitation in
83 | the term "modification".) Each licensee is addressed as "you".
84 |
85 | Activities other than copying, distribution and modification are not
86 | covered by this License; they are outside its scope. The act of
87 | running the Program is not restricted, and the output from the Program
88 | is covered only if its contents constitute a work based on the
89 | Program (independent of having been made by running the Program).
90 | Whether that is true depends on what the Program does.
91 |
92 | 1. You may copy and distribute verbatim copies of the Program's
93 | source code as you receive it, in any medium, provided that you
94 | conspicuously and appropriately publish on each copy an appropriate
95 | copyright notice and disclaimer of warranty; keep intact all the
96 | notices that refer to this License and to the absence of any warranty;
97 | and give any other recipients of the Program a copy of this License
98 | along with the Program.
99 |
100 | You may charge a fee for the physical act of transferring a copy, and
101 | you may at your option offer warranty protection in exchange for a fee.
102 |
103 | 2. You may modify your copy or copies of the Program or any portion
104 | of it, thus forming a work based on the Program, and copy and
105 | distribute such modifications or work under the terms of Section 1
106 | above, provided that you also meet all of these conditions:
107 |
108 | a) You must cause the modified files to carry prominent notices
109 | stating that you changed the files and the date of any change.
110 |
111 | b) You must cause any work that you distribute or publish, that in
112 | whole or in part contains or is derived from the Program or any
113 | part thereof, to be licensed as a whole at no charge to all third
114 | parties under the terms of this License.
115 |
116 | c) If the modified program normally reads commands interactively
117 | when run, you must cause it, when started running for such
118 | interactive use in the most ordinary way, to print or display an
119 | announcement including an appropriate copyright notice and a
120 | notice that there is no warranty (or else, saying that you provide
121 | a warranty) and that users may redistribute the program under
122 | these conditions, and telling the user how to view a copy of this
123 | License. (Exception: if the Program itself is interactive but
124 | does not normally print such an announcement, your work based on
125 | the Program is not required to print an announcement.)
126 |
127 | These requirements apply to the modified work as a whole. If
128 | identifiable sections of that work are not derived from the Program,
129 | and can be reasonably considered independent and separate works in
130 | themselves, then this License, and its terms, do not apply to those
131 | sections when you distribute them as separate works. But when you
132 | distribute the same sections as part of a whole which is a work based
133 | on the Program, the distribution of the whole must be on the terms of
134 | this License, whose permissions for other licensees extend to the
135 | entire whole, and thus to each and every part regardless of who wrote it.
136 |
137 | Thus, it is not the intent of this section to claim rights or contest
138 | your rights to work written entirely by you; rather, the intent is to
139 | exercise the right to control the distribution of derivative or
140 | collective works based on the Program.
141 |
142 | In addition, mere aggregation of another work not based on the Program
143 | with the Program (or with a work based on the Program) on a volume of
144 | a storage or distribution medium does not bring the other work under
145 | the scope of this License.
146 |
147 | 3. You may copy and distribute the Program (or a work based on it,
148 | under Section 2) in object code or executable form under the terms of
149 | Sections 1 and 2 above provided that you also do one of the following:
150 |
151 | a) Accompany it with the complete corresponding machine-readable
152 | source code, which must be distributed under the terms of Sections
153 | 1 and 2 above on a medium customarily used for software interchange; or,
154 |
155 | b) Accompany it with a written offer, valid for at least three
156 | years, to give any third party, for a charge no more than your
157 | cost of physically performing source distribution, a complete
158 | machine-readable copy of the corresponding source code, to be
159 | distributed under the terms of Sections 1 and 2 above on a medium
160 | customarily used for software interchange; or,
161 |
162 | c) Accompany it with the information you received as to the offer
163 | to distribute corresponding source code. (This alternative is
164 | allowed only for noncommercial distribution and only if you
165 | received the program in object code or executable form with such
166 | an offer, in accord with Subsection b above.)
167 |
168 | The source code for a work means the preferred form of the work for
169 | making modifications to it. For an executable work, complete source
170 | code means all the source code for all modules it contains, plus any
171 | associated interface definition files, plus the scripts used to
172 | control compilation and installation of the executable. However, as a
173 | special exception, the source code distributed need not include
174 | anything that is normally distributed (in either source or binary
175 | form) with the major components (compiler, kernel, and so on) of the
176 | operating system on which the executable runs, unless that component
177 | itself accompanies the executable.
178 |
179 | If distribution of executable or object code is made by offering
180 | access to copy from a designated place, then offering equivalent
181 | access to copy the source code from the same place counts as
182 | distribution of the source code, even though third parties are not
183 | compelled to copy the source along with the object code.
184 |
185 | 4. You may not copy, modify, sublicense, or distribute the Program
186 | except as expressly provided under this License. Any attempt
187 | otherwise to copy, modify, sublicense or distribute the Program is
188 | void, and will automatically terminate your rights under this License.
189 | However, parties who have received copies, or rights, from you under
190 | this License will not have their licenses terminated so long as such
191 | parties remain in full compliance.
192 |
193 | 5. You are not required to accept this License, since you have not
194 | signed it. However, nothing else grants you permission to modify or
195 | distribute the Program or its derivative works. These actions are
196 | prohibited by law if you do not accept this License. Therefore, by
197 | modifying or distributing the Program (or any work based on the
198 | Program), you indicate your acceptance of this License to do so, and
199 | all its terms and conditions for copying, distributing or modifying
200 | the Program or works based on it.
201 |
202 | 6. Each time you redistribute the Program (or any work based on the
203 | Program), the recipient automatically receives a license from the
204 | original licensor to copy, distribute or modify the Program subject to
205 | these terms and conditions. You may not impose any further
206 | restrictions on the recipients' exercise of the rights granted herein.
207 | You are not responsible for enforcing compliance by third parties to
208 | this License.
209 |
210 | 7. If, as a consequence of a court judgment or allegation of patent
211 | infringement or for any other reason (not limited to patent issues),
212 | conditions are imposed on you (whether by court order, agreement or
213 | otherwise) that contradict the conditions of this License, they do not
214 | excuse you from the conditions of this License. If you cannot
215 | distribute so as to satisfy simultaneously your obligations under this
216 | License and any other pertinent obligations, then as a consequence you
217 | may not distribute the Program at all. For example, if a patent
218 | license would not permit royalty-free redistribution of the Program by
219 | all those who receive copies directly or indirectly through you, then
220 | the only way you could satisfy both it and this License would be to
221 | refrain entirely from distribution of the Program.
222 |
223 | If any portion of this section is held invalid or unenforceable under
224 | any particular circumstance, the balance of the section is intended to
225 | apply and the section as a whole is intended to apply in other
226 | circumstances.
227 |
228 | It is not the purpose of this section to induce you to infringe any
229 | patents or other property right claims or to contest validity of any
230 | such claims; this section has the sole purpose of protecting the
231 | integrity of the free software distribution system, which is
232 | implemented by public license practices. Many people have made
233 | generous contributions to the wide range of software distributed
234 | through that system in reliance on consistent application of that
235 | system; it is up to the author/donor to decide if he or she is willing
236 | to distribute software through any other system and a licensee cannot
237 | impose that choice.
238 |
239 | This section is intended to make thoroughly clear what is believed to
240 | be a consequence of the rest of this License.
241 |
242 | 8. If the distribution and/or use of the Program is restricted in
243 | certain countries either by patents or by copyrighted interfaces, the
244 | original copyright holder who places the Program under this License
245 | may add an explicit geographical distribution limitation excluding
246 | those countries, so that distribution is permitted only in or among
247 | countries not thus excluded. In such case, this License incorporates
248 | the limitation as if written in the body of this License.
249 |
250 | 9. The Free Software Foundation may publish revised and/or new versions
251 | of the General Public License from time to time. Such new versions will
252 | be similar in spirit to the present version, but may differ in detail to
253 | address new problems or concerns.
254 |
255 | Each version is given a distinguishing version number. If the Program
256 | specifies a version number of this License which applies to it and "any
257 | later version", you have the option of following the terms and conditions
258 | either of that version or of any later version published by the Free
259 | Software Foundation. If the Program does not specify a version number of
260 | this License, you may choose any version ever published by the Free Software
261 | Foundation.
262 |
263 | 10. If you wish to incorporate parts of the Program into other free
264 | programs whose distribution conditions are different, write to the author
265 | to ask for permission. For software which is copyrighted by the Free
266 | Software Foundation, write to the Free Software Foundation; we sometimes
267 | make exceptions for this. Our decision will be guided by the two goals
268 | of preserving the free status of all derivatives of our free software and
269 | of promoting the sharing and reuse of software generally.
270 |
271 | NO WARRANTY
272 |
273 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
274 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
275 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
276 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
277 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
278 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
279 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
280 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
281 | REPAIR OR CORRECTION.
282 |
283 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
284 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
285 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
286 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
287 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
288 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
289 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
290 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
291 | POSSIBILITY OF SUCH DAMAGES.
292 |
293 | END OF TERMS AND CONDITIONS
294 |
295 | How to Apply These Terms to Your New Programs
296 |
297 | If you develop a new program, and you want it to be of the greatest
298 | possible use to the public, the best way to achieve this is to make it
299 | free software which everyone can redistribute and change under these terms.
300 |
301 | To do so, attach the following notices to the program. It is safest
302 | to attach them to the start of each source file to most effectively
303 | convey the exclusion of warranty; and each file should have at least
304 | the "copyright" line and a pointer to where the full notice is found.
305 |
306 | {description}
307 | Copyright (C) {year} {fullname}
308 |
309 | This program is free software; you can redistribute it and/or modify
310 | it under the terms of the GNU General Public License as published by
311 | the Free Software Foundation; either version 2 of the License, or
312 | (at your option) any later version.
313 |
314 | This program is distributed in the hope that it will be useful,
315 | but WITHOUT ANY WARRANTY; without even the implied warranty of
316 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
317 | GNU General Public License for more details.
318 |
319 | You should have received a copy of the GNU General Public License along
320 | with this program; if not, write to the Free Software Foundation, Inc.,
321 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
322 |
323 | Also add information on how to contact you by electronic and paper mail.
324 |
325 | If the program is interactive, make it output a short notice like this
326 | when it starts in an interactive mode:
327 |
328 | Gnomovision version 69, Copyright (C) year name of author
329 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
330 | This is free software, and you are welcome to redistribute it
331 | under certain conditions; type `show c' for details.
332 |
333 | The hypothetical commands `show w' and `show c' should show the appropriate
334 | parts of the General Public License. Of course, the commands you use may
335 | be called something other than `show w' and `show c'; they could even be
336 | mouse-clicks or menu items--whatever suits your program.
337 |
338 | You should also get your employer (if you work as a programmer) or your
339 | school, if any, to sign a "copyright disclaimer" for the program, if
340 | necessary. Here is a sample; alter the names:
341 |
342 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
343 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
344 |
345 | {signature of Ty Coon}, 1 April 1989
346 | Ty Coon, President of Vice
347 |
348 | This General Public License does not permit incorporating your program into
349 | proprietary programs. If your program is a subroutine library, you may
350 | consider it more useful to permit linking proprietary applications with the
351 | library. If this is what you want to do, use the GNU Lesser General
352 | Public License instead of this License.
353 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Egocentric Video Description based on Temporally-Linked Sequences
2 |
3 | This repository contains the code for building the Temporally-linked Multi-input Attention (TMA) model, which was presented in
4 | the work [Egocentric Video Description based on Temporally-Linked Sequences](),
5 | submitted to the [Journal of Visual Communication and Image Representation](https://www.journals.elsevier.com/journal-of-visual-communication-and-image-representation).
6 | With this module, you can replicate our experiments and easily deploy new models. TMA is built upon our fork of the
7 | [Keras](https://github.com/MarcBS/keras) framework ([version 1.2](https://github.com/MarcBS/keras/tree/Keras-1.2-(stable))) and tested with the [Theano](http://deeplearning.net/software/theano)
8 | backend.
9 |
10 | ## Features:
11 |
12 | * Temporally-linked mechanism for learning using information from previous events.
13 | * Multi-input Attention LSTM model over any of the input multimodal sequences.
14 | * Peeked decoder LSTM: the previously generated word is an input of the current LSTM timestep.
15 | * MLPs for initializing the LSTM hidden and memory states.
16 | * Beam search decoding.
17 |
18 | ## Architecture
19 |
20 | 
21 |
22 | ## Requirements
23 |
24 | TMA requires the following libraries:
25 |
26 | - [Our version of Keras](https://github.com/MarcBS/keras) >= 1.2.3
27 | - [Multimodal Keras Wrapper](https://github.com/MarcBS/multimodal_keras_wrapper) >= 0.7
28 | - [Coco-caption evaluation package](https://github.com/lvapeab/coco-caption/tree/master/pycocoevalcap/)
29 |
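Before training, you can quickly check that the required libraries are importable (a minimal sketch; it assumes the Keras fork and the Multimodal Keras Wrapper are installed under their usual package names, `keras` and `keras_wrapper`):

```
# Sanity check: the imports the training scripts rely on should resolve.
import keras            # MarcBS fork, >= 1.2.3
import keras_wrapper    # Multimodal Keras Wrapper, >= 0.7

print(keras.__version__)
```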
30 | ## Instructions:
31 |
32 | Assuming you have a dataset and features extracted from the video frames:
33 |
34 | 0) Set the paths to Keras and the Multimodal Keras Wrapper in `train.sh`.
35 |
36 | 1) Prepare data:
37 |
38 | ``
39 | python data_engine/subsample_frames_features.py
40 | ``
41 |
42 | ``
43 | python data_engine/generate_features_lists.py
44 | ``
45 |
46 | ``
47 | python data_engine/generate_descriptions_lists.py
48 | ``
49 |
50 | See [data_engine/README.md](data_engine/README.md) for detailed information.
51 |
52 | 2) Prepare the inputs/outputs of your model in `data_engine/prepare_data.py`
53 |
54 | 3) Set a model configuration in `config.py`
55 |
56 | 4) Train!:
57 |
58 | ``
59 | python main.py
60 | ``
61 |
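Before launching a long run, you can inspect the configuration that `main.py` will load (a minimal sketch; it only assumes the repository root is on the Python path):

```
# Inspect the parameters defined in config.py without starting training.
from config import load_parameters

params = load_parameters()
print(params['MODEL_TYPE'])    # e.g. 'TemporallyLinkedVideoDescriptionAttDoublePrev'
print(params['DATASET_NAME'])  # e.g. 'EDUB-SegDesc_features'
print(params['STORE_PATH'])    # where trained models and evaluation results are stored
```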
62 | ## Dataset
63 |
64 | The dataset [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) was used to evaluate this model. It was acquired with the wearable camera Narrative Clip, which takes a picture every 30 seconds (2 fpm). It consists of 55 days recorded by 9 people, containing a total of 48,717 images divided into 1,339 events (or image sequences) and 3,991 captions.
65 |
66 | ## Citation
67 |
68 | If you use this code for any purpose, please do not forget to cite the following paper:
69 |
70 | ```
71 | Marc Bolaños, Álvaro Peris, Francisco Casacuberta, Sergi Soler and Petia Radeva.
72 | Egocentric Video Description based on Temporally-Linked Sequences
73 | In Special Issue on Egocentric Vision and Lifelogging Tools.
74 | Journal of Visual Communication and Image Representation (VCIR), (SUBMITTED).
75 | ```
76 |
77 | ## About
78 |
79 | Joint collaboration between the [Computer Vision at the University of Barcelona (CVUB)](http://www.ub.edu/cvub/) group at [Universitat de Barcelona](http://www.ub.edu)-[CVC](http://www.cvc.uab.es) and the [PRHLT Research Center](https://www.prhlt.upv.es) at [Universitat Politècnica de València](https://www.upv.es).
80 |
81 |
82 | ## Contact
83 |
84 | Marc Bolaños ([web page](http://www.ub.edu/cvub/marcbolanos/)): marc.bolanos@ub.edu
85 |
86 | Álvaro Peris ([web page](http://lvapeab.github.io/)): lvapeab@prhlt.upv.es
87 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/__init__.py
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | def load_parameters():
2 | """
3 | Loads the defined parameters
4 | """
5 | # Input data params
6 | DATA_ROOT_PATH = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/'
7 |
8 | # preprocessed features
9 | DATASET_NAME = 'EDUB-SegDesc_features' # Dataset name (add '-linked' suffix for using
10 | # dataset with temporally-linked training data)
11 | #
12 | # -linked
13 | # -linked-upperbound
14 | # -linked-upperbound-copy
15 | # -linked-upperbound-prev
16 | # -linked-upperbound-nocopy
17 | # -linked-video
18 | # -linked-vidtext
19 | # -vidtext-embed
20 | #
21 |
22 | PRE_TRAINED_DATASET_NAME = None #'MSVD_features' # Dataset name for reusing vocabulary of pre-trained model (set to None for disabling)
23 | # (only applicable if we are using a pre-trained model, default None)
24 | VOCABULARIES_MAPPING = {'description': 'description',
25 | 'state_below': 'description',
26 | 'prev_description': 'description'}
27 |
28 | PRE_TRAINED_VOCABULARY_NAME = None #'1BillionWords_vocabulary' # Dataset name for reusing vocabulary of pre-trained model
29 |
30 | # Input data
31 | INPUT_DATA_TYPE = 'video-features' # 'video-features' or 'video'
32 | NUM_FRAMES = 26 # fixed number of input frames per video
33 |
34 | if '-noninfo' in DATASET_NAME:
35 | suffix_annotations = '_without_noninfo'
36 | suffix_features = '_Without_NonInfo'
37 | else:
38 | suffix_annotations = ''
39 | suffix_features = ''
40 |
41 | #### Features from video frames
42 | FRAMES_LIST_FILES = {'train': 'Annotations/%s/train_feat_list'+suffix_annotations+'.txt', # Feature frames list files
43 | 'val': 'Annotations/%s/val_feat_list'+suffix_annotations+'.txt',
44 | 'test': 'Annotations/%s/test_feat_list'+suffix_annotations+'.txt',
45 | }
46 | FRAMES_COUNTS_FILES = { 'train': 'Annotations/%s/train_feat_counts'+suffix_annotations+'.txt', # Frames counts files
47 | 'val': 'Annotations/%s/val_feat_counts'+suffix_annotations+'.txt',
48 | 'test': 'Annotations/%s/test_feat_counts'+suffix_annotations+'.txt',
49 | }
50 | FEATURE_NAMES = ['ImageNet'
51 | + suffix_features] # append '_L2' at the end of each feature type if using their L2 version
52 |
53 | # Output data
54 | DESCRIPTION_FILES = {'train': 'Annotations/train_descriptions'+suffix_annotations+'.txt', # Description files
55 | 'val': 'Annotations/val_descriptions'+suffix_annotations+'.txt',
56 | 'test': 'Annotations/test_descriptions'+suffix_annotations+'.txt',
57 | }
58 | DESCRIPTION_COUNTS_FILES = { 'train': 'Annotations/train_descriptions_counts'+suffix_annotations+'.npy', # Description counts files
59 | 'val': 'Annotations/val_descriptions_counts'+suffix_annotations+'.npy',
60 | 'test': 'Annotations/test_descriptions_counts'+suffix_annotations+'.npy',
61 | }
62 |
63 | # Dataset parameters
64 | if not '-vidtext-embed' in DATASET_NAME:
65 | INPUTS_IDS_DATASET = ['video', 'state_below'] # Corresponding inputs of the dataset
66 | OUTPUTS_IDS_DATASET = ['description'] # Corresponding outputs of the dataset
67 | INPUTS_IDS_MODEL = ['video', 'state_below'] # Corresponding inputs of the built model
68 | OUTPUTS_IDS_MODEL = ['description'] # Corresponding outputs of the built model
69 | else:
70 | INPUTS_IDS_DATASET = ['video', 'description'] # Corresponding inputs of the dataset
71 | OUTPUTS_IDS_DATASET = ['match'] # Corresponding outputs of the dataset
72 | INPUTS_IDS_MODEL = ['video', 'description'] # Corresponding inputs of the built model
73 | OUTPUTS_IDS_MODEL = ['match'] # Corresponding outputs of the built model
74 |
75 |
76 | if '-linked' in DATASET_NAME:
77 |
78 | LINK_SAMPLE_FILES = {'train': 'Annotations/train_link_samples'+suffix_annotations+'.txt', # Links index files
79 | 'val': 'Annotations/val_link_samples'+suffix_annotations+'.txt',
80 | 'test': 'Annotations/test_link_samples'+suffix_annotations+'.txt',
81 | }
82 |
83 | INPUTS_IDS_DATASET.append('prev_description')
84 | INPUTS_IDS_MODEL.append('prev_description')
85 |
86 | if '-vidtext' in DATASET_NAME:
87 | INPUTS_IDS_DATASET.append('prev_video')
88 | INPUTS_IDS_MODEL.append('prev_video')
89 |
90 | if '-upperbound' not in DATASET_NAME and '-video' not in DATASET_NAME:
91 | INPUTS_IDS_DATASET.append('link_index')
92 | INPUTS_IDS_MODEL.append('link_index')
93 |
94 |
95 | # Evaluation params
96 | if not '-vidtext-embed' in DATASET_NAME:
97 | METRICS = ['coco'] # Metric used for evaluating model after each epoch (leave empty if only prediction is required)
98 | else:
99 | METRICS = ['multiclass_metrics']
100 | EVAL_ON_SETS = ['val', 'test'] # Possible values: 'train', 'val' and 'test' (external evaluator)
101 | EVAL_ON_SETS_KERAS = [] # Possible values: 'train', 'val' and 'test' (Keras' evaluator)
102 | START_EVAL_ON_EPOCH = 0 # First epoch where the model will be evaluated
103 | EVAL_EACH_EPOCHS = False # Select whether evaluate between N epochs or N updates
104 | EVAL_EACH = 50 # Sets the evaluation frequency (epochs or updates)
105 |
106 | # Search parameters
107 | SAMPLING = 'max_likelihood' # Possible values: multinomial or max_likelihood (recommended)
108 | TEMPERATURE = 1 # Multinomial sampling parameter
109 | if not '-vidtext-embed' in DATASET_NAME:
110 | BEAM_SEARCH = True # Switches on-off the beam search procedure
111 | else:
112 | BEAM_SEARCH = False
113 | BEAM_SIZE = 10 # Beam size (in case of BEAM_SEARCH == True)
114 | BEAM_SEARCH_COND_INPUT = 1 # Index of the conditional input used in beam search (i.e., state_below)
115 | OPTIMIZED_SEARCH = True # Compute annotations only a single time per sample
116 | NORMALIZE_SAMPLING = False # Normalize hypotheses scores according to their length
117 | ALPHA_FACTOR = .6 # Normalization according to length**ALPHA_FACTOR
118 | # (see: arxiv.org/abs/1609.08144)
119 |
120 | # Sampling params: Show some samples during training
121 | if not '-vidtext-embed' in DATASET_NAME:
122 | SAMPLE_ON_SETS = ['train', 'val'] # Possible values: 'train', 'val' and 'test'
123 | else:
124 | SAMPLE_ON_SETS = []
125 | N_SAMPLES = 5 # Number of samples generated
126 |     START_SAMPLING_ON_EPOCH = 0                       # First epoch where samples will be generated
127 | SAMPLE_EACH_UPDATES = 50 # Sampling frequency (default 450)
128 |
129 | # Word representation params
130 | TOKENIZATION_METHOD = 'tokenize_icann' # Select which tokenization we'll apply:
131 | # tokenize_basic, tokenize_aggressive, tokenize_soft,
132 | # tokenize_icann or tokenize_questions
133 |
134 | FILL = 'end' # whether we fill the 'end' or the 'start' of the sentence with 0s
135 | TRG_LAN = 'en' # Language of the outputs (mainly used for the Meteor evaluator)
136 |     PAD_ON_BATCH = True                               # Whether we take as many timesteps as the longest sequence in the batch
137 | # or a fixed size (MAX_OUTPUT_TEXT_LEN)
138 |
139 | # Input image parameters
140 | DATA_AUGMENTATION = False # Apply data augmentation on input data (noise on features)
141 | DATA_AUGMENTATION_TYPE = ['random_selection'] # 'random_selection', 'noise'
142 | IMG_FEAT_SIZE = 1024 # Size of the image features
143 |
144 | # Output text parameters
145 |     OUTPUT_VOCABULARY_SIZE = 0                        # Size of the output vocabulary. Set to 0 for using all,
146 | # otherwise it will be truncated to these most frequent words.
147 | MAX_OUTPUT_TEXT_LEN = 30 # Maximum length of the output sequence
148 | # set to 0 if we want to use the whole answer as a single class
149 | MAX_OUTPUT_TEXT_LEN_TEST = 50 # Maximum length of the output sequence during test time
150 |     MIN_OCCURRENCES_VOCAB = 0                         # Minimum number of occurrences allowed for the words in the vocabulary.
151 |
152 | # Optimizer parameters (see model.compile() function)
153 | LOSS = 'categorical_crossentropy'
154 | CLASSIFIER_ACTIVATION = 'softmax'
155 |
156 | OPTIMIZER = 'Adadelta' # Optimizer
157 | LR = 1. # Learning rate. Recommended values - Adam 0.001 - Adadelta 1.0
158 | CLIP_C = 10. # During training, clip gradients to this norm
159 | if not '-vidtext-embed' in DATASET_NAME:
160 | SAMPLE_WEIGHTS = True # Select whether we use a weights matrix (mask) for the data outputs
161 |     LR_DECAY = None                                   # Minimum number of epochs before the next LR decay. Set to None if you don't want to decay the learning rate
162 | LR_GAMMA = 0.995 # Multiplier used for decreasing the LR
163 |
164 | # Training parameters
165 | MAX_EPOCH = 200 # Stop when computed this number of epochs
166 | BATCH_SIZE = 64 # ABiViRNet trained with BATCH_SIZE = 64
167 |
168 | HOMOGENEOUS_BATCHES = False # Use batches with homogeneous output lengths for every minibatch (Possibly buggy!)
169 | PARALLEL_LOADERS = 8 # Parallel data batch loaders
170 | EPOCHS_FOR_SAVE = 1 if EVAL_EACH_EPOCHS else None # Number of epochs between model saves (None for disabling epoch save)
171 | WRITE_VALID_SAMPLES = True # Write valid samples in file
172 | SAVE_EACH_EVALUATION = True if not EVAL_EACH_EPOCHS else False # Save each time we evaluate the model
173 |
174 | # Early stop parameters
175 | EARLY_STOP = True # Turns on/off the early stop protocol
176 | PATIENCE = 20 # We'll stop if the val STOP_METRIC does not improve after this
177 | # number of evaluations
178 |
179 | if not '-vidtext-embed' in DATASET_NAME:
180 | STOP_METRIC = 'Bleu_4' # Metric for the stop
181 | else:
182 | STOP_METRIC = 'accuracy'
183 |
184 | # Model parameters
185 | MODEL_TYPE = 'TemporallyLinkedVideoDescriptionAttDoublePrev' # 'ArcticVideoCaptionWithInit'
186 | # 'ArcticVideoCaptionNoLSTMEncWithInit'
187 | # 'TemporallyLinkedVideoDescriptionNoAtt'
188 | # 'TemporallyLinkedVideoDescriptionAtt'
189 | # 'TemporallyLinkedVideoDescriptionAttDoublePrev'
190 | # 'VideoTextEmbedding'
191 | # 'DeepSeek'
192 |
193 | RNN_TYPE = 'LSTM' # RNN unit type ('LSTM' supported)
194 |
195 | # Input text parameters
196 | TARGET_TEXT_EMBEDDING_SIZE = 301 # Source language word embedding size (ABiViRNet 301)
197 | TRG_PRETRAINED_VECTORS = None # Path to pretrained vectors. (e.g. DATA_ROOT_PATH + '/DATA/word2vec.%s.npy' % TRG_LAN)
198 | # Set to None if you don't want to use pretrained vectors.
199 | # When using pretrained word embeddings, the size of the pretrained word embeddings must match with the word embeddings size.
200 | TRG_PRETRAINED_VECTORS_TRAINABLE = True # Finetune or not the target word embedding vectors.
201 |
202 | # Encoder configuration
203 | ENCODER_HIDDEN_SIZE = 717 # For models with RNN encoder (ABiViRNet 717)
204 | BIDIRECTIONAL_ENCODER = True # Use bidirectional encoder
205 | N_LAYERS_ENCODER = 1 # Stack this number of encoding layers (default 1)
206 | BIDIRECTIONAL_DEEP_ENCODER = True # Use bidirectional encoder in all encoding layers
207 |
208 |
209 | # Previous sentence encoder
210 | PREV_SENT_ENCODER_HIDDEN_SIZE = 717 # For models with previous sentence RNN encoder (484)
211 | BIDIRECTIONAL_PREV_SENT_ENCODER = True # Use bidirectional encoder
212 | N_LAYERS_PREV_SENT_ENCODER = 1 # Stack this number of encoding layers
213 | BIDIRECTIONAL_DEEP_PREV_SENT_ENCODER = True # Use bidirectional encoder in all encoding layers
214 |
215 | DECODER_HIDDEN_SIZE = 484 # For models with LSTM decoder (ABiViRNet 484)
216 | SKIP_VECTORS_HIDDEN_SIZE = TARGET_TEXT_EMBEDDING_SIZE
217 | ADDITIONAL_OUTPUT_MERGE_MODE = 'sum' # Merge mode for the skip connections
218 |     WEIGHTED_MERGE = False                            # Whether we want to apply a conventional or a weighted merge
219 |
220 |
221 | AFFINE_LAYERS_DIM = 500 # Dimensionality of the affine layers in 'DeepSeek' model
222 |
223 | IMG_EMBEDDING_LAYERS = [] # FC layers for visual embedding
224 | # Here we should specify the activation function and the output dimension
225 |                                # (e.g. IMG_EMBEDDING_LAYERS = [('linear', 1024)])
226 |
227 | # Fully-Connected layers for initializing the first RNN state
228 | # Here we should only specify the activation function of each layer
229 | # (as they have a potentially fixed size)
230 | # (e.g INIT_LAYERS = ['tanh', 'relu'])
231 | INIT_LAYERS = ['tanh']
232 |
233 | # Additional Fully-Connected layers's sizes applied before softmax.
234 | # Here we should specify the activation function and the output dimension
235 | # (e.g DEEP_OUTPUT_LAYERS = [('tanh', 600), ('relu', 400), ('relu', 200)])
236 | DEEP_OUTPUT_LAYERS = []
237 |
238 | # Regularizers
239 | WEIGHT_DECAY = 1e-4 # L2 regularization
240 | RECURRENT_WEIGHT_DECAY = 0. # L2 regularization in recurrent layers
241 |
242 | USE_DROPOUT = True # Use dropout
243 | DROPOUT_P = 0.5 # Percentage of units to drop
244 |
245 | USE_RECURRENT_DROPOUT = False # Use dropout in recurrent layers # DANGEROUS!
246 | RECURRENT_DROPOUT_P = 0.5 # Percentage of units to drop in recurrent layers
247 |
248 | USE_NOISE = True # Use gaussian noise during training
249 | NOISE_AMOUNT = 0.01 # Amount of noise
250 |
251 | USE_BATCH_NORMALIZATION = True # If True it is recommended to deactivate Dropout
252 | BATCH_NORMALIZATION_MODE = 1 # See documentation in Keras' BN
253 |
254 | USE_PRELU = False # use PReLU activations as regularizer
255 | USE_L2 = False # L2 normalization on the features
256 |
257 | # Results plot and models storing parameters
258 | EXTRA_NAME = '' # This will be appended to the end of the model name
259 | MODEL_NAME = DATASET_NAME + '_' + MODEL_TYPE +\
260 | '_txtemb_' + str(TARGET_TEXT_EMBEDDING_SIZE) + \
261 | '_imgemb_' + '_'.join([layer[0] for layer in IMG_EMBEDDING_LAYERS]) + \
262 | '_lstmenc_' + str(ENCODER_HIDDEN_SIZE) + \
263 | '_lstm_' + str(DECODER_HIDDEN_SIZE) + \
264 | '_additional_output_mode_' + str(ADDITIONAL_OUTPUT_MERGE_MODE) + \
265 | '_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \
266 | '_' + OPTIMIZER + '_decay_' + str(LR_DECAY) + '-' + str(LR_GAMMA)
267 |
268 | MODEL_NAME += '_' + EXTRA_NAME
269 |
270 | # Name and location of the pre-trained model (only if RELOAD > 0)
271 | PRE_TRAINED_MODELS = ['MSVD_best_model']
272 | # default: MODEL_NAME
273 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification_BLSTM_text']
274 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification']
275 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adam_decay_1-0.95vidtext_embed']
276 | # ['MSVD_best_model']
277 | # ['MSVD_best_model', '1BillionWords']
278 | PRE_TRAINED_MODEL_STORE_PATHS = map(lambda x: 'trained_models/' + x + '/', PRE_TRAINED_MODELS) if isinstance(PRE_TRAINED_MODELS, list) else 'trained_models/'+PRE_TRAINED_MODELS+'/'
279 | LOAD_WEIGHTS_ONLY = True # Load weights of pre-trained model or complete Model_Wrapper instance
280 | # Layers' mapping from old to new model if LOAD_WEIGHTS_ONLY
281 | # You can check the layers of a model with [layer.name for layer in model_wrapper.model.layers]
282 | if '-video' in DATASET_NAME:
283 | # Pre-train MSVD
284 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM',
285 | 'initial_state': 'initial_state',
286 | 'initial_memory': 'initial_memory',
287 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond',
288 | 'target_word_embedding': 'target_word_embedding',
289 | 'logit_ctx': 'logit_ctx',
290 | 'logit_lstm': 'logit_lstm',
291 | 'description': 'description'
292 | }
293 | ]
294 | # Pre-train vidtext embedding
295 | """
296 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM',
297 | 'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM',
298 | 'target_word_embedding': 'target_word_embedding',
299 | 'logit_ctx': 'logit_ctx',
300 | 'logit_prev': 'logit_prev',
301 | }
302 | ]
303 | """
304 |
305 | elif '-vidtext-embed' in DATASET_NAME:
306 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM',
307 | 'target_word_embedding': 'target_word_embedding',
308 | 'logit_ctx': 'logit_ctx',
309 | }
310 | ]
311 | else:
312 | if MODEL_TYPE == 'ArcticVideoCaptionWithInit':
313 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM',
314 | 'initial_state': 'initial_state',
315 | 'initial_memory': 'initial_memory',
316 | 'attlstmcond_1': 'decoder_AttLSTMCond',
317 | 'target_word_embedding': 'target_word_embedding',
318 | 'logit_ctx': 'logit_ctx',
319 | 'logit_lstm': 'logit_lstm',
320 | 'description': 'description'
321 | }
322 | ]
323 |
324 | elif MODEL_TYPE == 'TemporallyLinkedVideoDescriptionAttDoublePrev':
325 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM',
326 | 'initial_state': 'initial_state',
327 | 'initial_memory': 'initial_memory',
328 | 'attlstmcond_1': 'decoder_AttLSTMCond3Inputs', # 'decoder_AttLSTMCond',
329 | 'target_word_embedding': 'target_word_embedding',
330 | 'logit_ctx': 'logit_ctx',
331 | 'logit_lstm': 'logit_lstm',
332 | 'description': 'description'
333 | }
334 | ]
335 |
336 | elif len(PRE_TRAINED_MODELS) == 2:
337 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM',
338 | 'initial_state': 'initial_state',
339 | 'initial_memory': 'initial_memory',
340 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond',
341 | #'target_word_embedding': 'target_word_embedding',
342 | 'logit_ctx': 'logit_ctx',
343 | 'logit_lstm': 'logit_lstm',
344 | #'description': 'description'
345 | },
346 | {'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', #'prev_desc_emb_encoder_LSTM',
347 | 'target_word_embedding': 'target_word_embedding',
348 | 'decoder_AttLSTMCond': 'decoder_AttLSTMCond2Inputs', #'decoder_AttLSTMCond',
349 | 'target_text': 'description'
350 | }
351 | ]
352 |
353 | elif len(PRE_TRAINED_MODELS) == 1: # reuse data from vidtext-embedding model
354 |
355 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM',
356 | 'prev_desc_emb_bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM',
357 | 'target_word_embedding': 'target_word_embedding',
358 | 'logit_ctx': 'logit_ctx',
359 | 'logit_prev': 'logit_prev',
360 | }
361 | ]
362 |
363 |
364 | STORE_PATH = 'trained_models/' + MODEL_NAME + '/' # Models and evaluation results will be stored here
365 | DATASET_STORE_PATH = 'datasets/' # Dataset instance will be stored here
366 |
367 | SAMPLING_SAVE_MODE = 'list' # 'list' or 'vqa'
368 |     VERBOSE = 1                                       # Verbosity level
369 | RELOAD = 0 # If 0 start training from scratch, otherwise the model
370 | # Saved on epoch 'RELOAD' will be used
371 | REBUILD_DATASET = True # Build again or use stored instance
372 | MODE = 'training' # 'training' or 'sampling' (if 'sampling' then RELOAD must
373 | # be greater than 0 and EVAL_ON_SETS will be used)
374 | RELOAD_PATH = None
375 | SAMPLING_RELOAD_EPOCH = False
376 | SAMPLING_RELOAD_POINT = 0
377 | # Extra parameters for special trainings
378 | TRAIN_ON_TRAINVAL = False # train the model on both training and validation sets combined
379 |     FORCE_RELOAD_VOCABULARY = False                   # force building a new vocabulary from the training samples (applicable if RELOAD > 1)
380 |
381 | # ============================================
382 | parameters = locals().copy()
383 | return parameters
384 |
--------------------------------------------------------------------------------
/data_engine/README.md:
--------------------------------------------------------------------------------
1 | # Preprocessing of EDUB-SegDesc dataset
2 |
3 | The scripts stored in this folder ('data_engine') preprocess the data from the [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) dataset so that they can be used as input for building a Dataset object instance (see [staged_keras_wrapper](https://github.com/MarcBS/staged_keras_wrapper)).
4 |
5 | Two different kinds of inputs can be used for training the video description models:
6 |
7 | 1) Raw video frames (see section 'Image lists generation')
8 | 2) Features from video frames (see section 'Image features generation')
9 |
10 | Additionally, we can train a model on temporally-linked samples; in that case, we have to run an additional pre-processing script.
11 |
12 | ## Folder structure
13 |
14 | Below we describe the desired folder structure for storing the dataset-related information:
15 |
16 | ./Images
17 | video_[video_id]
18 | [num_image].jpg
19 | [num_image].jpg
20 | ./Annotations
21 | test_list.txt
22 | train_list.txt
23 | val_list.txt
24 | captions.id.en
25 | ./Features
26 | test_[name_feat].csv
27 | train_[name_feat].csv
28 | val_[name_feat].csv
29 |
30 | The folder ./Images contains a set of folders 'video_[video_id]', where each folder represents a video and contains a set of frames '[num_image].jpg'.
31 |
32 | The folder ./Annotations contains, for each set split {train, val, test}, a file with the suffix _list.txt containing the list of videos 'video_[video_id]' belonging to the respective split. It also contains the file 'captions.id.en', which lists all the available captions for all the videos.
33 |
34 | The folder ./Features contains any kind of features extracted from the respective set splits (only needed if using image features instead of raw images).
35 |
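A quick way to verify this layout before running the preprocessing scripts (a minimal sketch; the dataset root below is the path used in the scripts and may differ on your machine):

    # Check that the expected dataset folders and split lists are in place.
    import os

    base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/'   # adjust to your dataset root
    for folder in ['Images', 'Annotations', 'Features']:
        assert os.path.isdir(os.path.join(base_path, folder)), folder + ' folder is missing'
    for split in ['train', 'val', 'test']:
        assert os.path.isfile(os.path.join(base_path, 'Annotations', split + '_list.txt'))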
36 |
37 | ## Descriptions generation
38 |
39 | This step will be needed either if we are using raw video frames or video features.
40 |
41 | Script name:
42 | generate_descriptions_lists.py
43 | Description:
44 | Extracts and counts the available descriptions for each video.
45 | Output:
46 | - A file per split with the suffix _descriptions.txt.
47 | Containing a list of descriptions for all videos.
48 | - A file per split with the suffix _descriptions_counts.npy.
49 | Containing a python list with the counts of descriptions per video.
50 | The output will be stored in ./Annotations.
51 |
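The generated files can be read back as follows (a minimal sketch, assuming the default output names for the train split and that the working directory is the dataset root):

    import numpy as np

    with open('Annotations/train_descriptions.txt') as f:
        descriptions = [line.rstrip('\n') for line in f]
    counts = np.load('Annotations/train_descriptions_counts.npy')   # descriptions per video

    assert sum(counts) == len(descriptions)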
52 |
53 | ## Image lists generation
54 |
55 | This step will be needed if we are using raw video frames only.
56 |
57 | Script name:
58 | generate_img_lists.py
59 | Description:
60 | Lists and counts the frames belonging to each video.
61 | Output:
62 | - A file per split with the suffix _imgs_list.txt.
63 | Containing the list of frames for all videos.
64 | - A file per split with the suffix _imgs_counts.txt.
65 | Containing a list of frame counts per video.
66 | The output will be stored in ./Annotations.
67 |
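The two files can be combined to recover the frames of each video (a minimal sketch, assuming the default output names for the train split):

    with open('Annotations/train_imgs_list.txt') as f:
        frames = [line.strip() for line in f]
    with open('Annotations/train_imgs_counts.txt') as f:
        counts = [int(line) for line in f]

    # Regroup the flat frame list into one list of frames per video.
    videos, start = [], 0
    for n in counts:
        videos.append(frames[start:start + n])
        start += n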
68 |
69 | ## Image features generation
70 |
71 | This step will be needed if we are using image features only. The number of feature vectors per video does not need to match the number of frames.
72 |
73 | Script name:
74 | generate_features_lists.py
75 | Description:
76 |     Stores each feature vector contained in the corresponding ./Features/[split_name]_[name_feat].csv in a separate .npy file and counts them.
77 | Output:
78 | - A file per split with the suffix _feat_list.txt.
79 | Containing the path to each feature vector.
80 | - A file per split with the suffix _feat_counts.txt.
81 | Containing the counts of vectors per video.
82 |     The output .txt files will be stored in ./Annotations/[name_feat]/ and the .npy files in ./Features/[name_feat]/.
83 |
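The resulting lists can then be used to load individual feature vectors (a minimal sketch, assuming the 'ImageNet' feature name and that the listed paths resolve from the dataset root):

    import numpy as np

    with open('Annotations/ImageNet/train_feat_list.txt') as f:
        feat_paths = [line.strip() for line in f]

    first_vector = np.load(feat_paths[0])   # one .npy file per feature vector
    print(first_vector.shape)               # e.g. (1024,) for the ImageNet features used here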
84 | ## Temporally-linked samples
85 |
86 | This step will be needed if we are using temporally-linked samples.
87 |
88 | Script name:
89 | generate_link_lists.py
90 | Description:
91 |     Stores a separate list ./Annotations/[split_list]_link_samples.txt with the indices to the previous samples in the temporal link.
92 | Output:
93 | - A file per split with the suffix _link_samples.txt.
94 |       Containing the index of the previous sample in the link (or -1 if it is the first sample in the link).
95 |
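These indices can be followed to reconstruct the temporal chain that precedes a given sample (a minimal sketch, assuming the default train split name and one index per line):

    with open('Annotations/train_link_samples.txt') as f:
        prev_index = [int(line) for line in f]

    def chain(idx):
        # Walk backwards through the link indices until the first sample (-1).
        indices = []
        while idx != -1:
            indices.append(idx)
            idx = prev_index[idx]
        return list(reversed(indices))   # oldest sample first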
--------------------------------------------------------------------------------
/data_engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/data_engine/__init__.py
--------------------------------------------------------------------------------
/data_engine/generate_corpus_full_history.py:
--------------------------------------------------------------------------------
1 | """
2 | the file id_seg_cap.txt has been generated with the following script
3 |
4 | awk '{print substr(FILENAME, 1, length(FILENAME)-4) "," $0}' * > ../id_seg_cap.txt
5 |
6 | and its format is:
7 | file_id, segment_number, caption
8 | """
9 |
10 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/'
11 |
12 | txt_files = base_path + 'id_seg_cap.txt'
13 | dest_files = base_path + 'captions.id.full_history.txt'
14 |
15 | file = open(txt_files, mode='r')
16 | dest_file = open(dest_files + 'curr', mode='w')
17 |
18 | separator = '----'
19 | space_sym = ' '
20 |
21 | prev_id = 'Segment1'
22 | caps_txt = []
23 | prev_caps = []
24 | j = 0
25 | for line in file:
26 | id_text = line.split(",")
27 | user_id = id_text[0]
28 | segment_id = id_text[1]
29 | text = ' '.join(id_text[2:]).strip()
30 | j += 1
31 | if j % 1000 == 0:
32 | print "Processed", j, "lines"
33 | if segment_id == prev_id:
34 | caps_txt.append(text)
35 |
36 | # for prev_cap in prev_caps:
37 | # caps_txt.append(prev_cap + space_sym + text)
38 | elif segment_id == 'Segment1': # Start of day
39 | prev_id = segment_id
40 | i = 0
41 | for curr_cap in caps_txt:
42 | dest_file.write(user_id + '_' + segment_id + '#' + str(i) + separator + curr_cap + '\n')
43 | i += 1
44 | prev_caps = caps_txt
45 | else:
46 | # Different segment
47 | # We combine
48 | prev_id = segment_id
49 | # for prev_cap in prev_caps:
50 | # prev_caps2.append(prev_cap + space_sym + cap)
51 | caps_txt = []
52 | caps_txt.append(text)
53 | i = 0
54 | for prev_cap in prev_caps:
55 | for curr_cap in caps_txt:
56 | dest_file.write(
57 | user_id + '_' + segment_id + '#' + str(i) + separator + prev_cap + space_sym + curr_cap + '\n')
58 | i += 1
59 | prev_caps = [prev_cap + space_sym + curr_cap for curr_cap in caps_txt for prev_cap in prev_caps]
60 |
--------------------------------------------------------------------------------
/data_engine/generate_descriptions_lists.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def main():
5 | # base_path = '/media/HDD_2TB/DATASETS/MSVD/'
6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/'
7 |
8 | without_noninfo = True
9 |
10 | path_files = 'Annotations'
11 |
12 | # Inputs
13 | # text = 'captions.id.en'
14 | if without_noninfo:
15 | text = 'captions_final_without_noninfo.id.en'
16 | else:
17 | text = 'captions_final.id.en'
18 | separator = '----'
19 |
20 | # train = 'train_list.txt'
21 | # val = 'val_list.txt'
22 | # test = 'test_list.txt'
23 |
24 | if without_noninfo:
25 | train = 'train_list_final_without_noninfo.txt'
26 | val = 'val_list_final_without_noninfo.txt'
27 | test = 'test_list_final_without_noninfo.txt'
28 |
29 | # Outputs
30 | train_out = 'train_descriptions_without_noninfo.txt'
31 | val_out = 'val_descriptions_without_noninfo.txt'
32 | test_out = 'test_descriptions_without_noninfo.txt'
33 |
34 | train_out_counts = 'train_descriptions_counts_without_noninfo.npy'
35 | val_out_counts = 'val_descriptions_counts_without_noninfo.npy'
36 | test_out_counts = 'test_descriptions_counts_without_noninfo.npy'
37 |
38 | else:
39 | train = 'train_list_final.txt'
40 | val = 'val_list_final.txt'
41 | test = 'test_list_final.txt'
42 |
43 | # Outputs
44 | train_out = 'train_descriptions.txt'
45 | val_out = 'val_descriptions.txt'
46 | test_out = 'test_descriptions.txt'
47 |
48 | train_out_counts = 'train_descriptions_counts.npy'
49 | val_out_counts = 'val_descriptions_counts.npy'
50 | test_out_counts = 'test_descriptions_counts.npy'
51 |
52 | #################################
53 |
54 | # Code
55 |
56 | text = path_files + '/' + text
57 | splits = [path_files + '/' + train, path_files + '/' + val, path_files + '/' + test]
58 | splits_out = [path_files + '/' + train_out, path_files + '/' + val_out, path_files + '/' + test_out]
59 | splits_counts = [path_files + '/' + train_out_counts, path_files + '/' + val_out_counts,
60 | path_files + '/' + test_out_counts]
61 |
62 | # read video names
63 | img_splits = [[], [], []]
64 | for i, s in enumerate(splits):
65 | with open(base_path + s, 'r') as f:
66 | for line in f:
67 | line = line.rstrip('\n')
68 | img_splits[i].append(line)
69 |
70 | # print img_splits
71 |
72 |
73 | # read descriptions and assign them to a split
74 | desc_splits = []
75 | counts_splits = []
76 | for i_s, s in enumerate(splits):
77 | desc_splits.append([[] for i in range(len(img_splits[i_s]))])
78 | counts_splits.append([0 for i in range(len(img_splits[i_s]))])
79 | with open(base_path + text, 'r') as f:
80 | for line in f:
81 | line = line.rstrip('\n')
82 | line = line.split('#')
83 | img = line[0]
84 | line = line[1].split(separator)
85 | desc = line[1]
86 |
87 | found = False
88 | i = 0
89 | while (not found and i < len(splits)):
90 | if (img in img_splits[i]):
91 | found = True
92 | idx = img_splits[i].index(img)
93 | desc_splits[i][idx].append(desc)
94 | counts_splits[i][idx] += 1
95 | i += 1
96 |
97 | if (not found):
98 | print 'Warning: Video ' + img + ' does not exist in lists'
99 |
100 | # write descriptions in separate files
101 | for f, d in zip(splits_out, desc_splits):
102 | f = open(base_path + f, 'w')
103 | for im in d:
104 | for desc in im:
105 | f.write(desc + '\n')
106 | f.close()
107 |
108 | # store description counts for each video
109 | for c, s in zip(counts_splits, splits_counts):
110 | np.save(base_path + s, c)
111 |
112 | print 'Done'
113 |
114 |
115 | main()
116 |
--------------------------------------------------------------------------------
/data_engine/generate_features_lists.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | import numpy as np
5 |
6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/'
7 | path_features = 'Features'
8 | path_annotations = 'Annotations'
9 | without_noninfo = True
10 |
11 | # Inputs
12 | if without_noninfo:
13 | features_name = 'ImageNet_Without_NonInfo'
14 | else:
15 | features_name = 'ImageNet'
16 |
17 | ###### Files with fixed number of frames per video
18 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv']
19 | # features_counts = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt']
20 |
21 | ###### Files with all original frames of videos
22 | # features_files = ['train_' + features_name + '.csv',
23 | # 'val_' + features_name + '.csv',
24 | # 'test_' + features_name + '.csv']
25 | # features_counts = ['train_' + features_name + '_counts.txt',
26 | # 'val_' + features_name + '_counts.txt',
27 | # 'test_' + features_name + '_all_frames_counts.txt']
28 |
29 |
30 | if without_noninfo:
31 | features_files = ['train_' + features_name + '_all_frames_without_noninfo.csv',
32 | 'val_' + features_name + '_all_frames_without_noninfo.csv',
33 | 'test_' + features_name + '_all_frames_without_noninfo.csv']
34 | features_counts = ['train_' + features_name + '_all_frames_counts_without_noninfo.txt',
35 | 'val_' + features_name + '_all_frames_counts_without_noninfo.txt',
36 | 'test_' + features_name + '_all_frames_counts_without_noninfo.txt']
37 | else:
38 | features_files = ['train_' + features_name + '_all_frames.csv',
39 | 'val_' + features_name + '_all_frames.csv',
40 | 'test_' + features_name + '_all_frames.csv']
41 | features_counts = ['train_' + features_name + '_all_frames_counts.txt',
42 | 'val_' + features_name + '_all_frames_counts.txt',
43 | 'test_' + features_name + '_all_frames_counts.txt']
44 |
45 | # features_name = 'C3D_fc8_ImageNet'
46 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv']
47 | # features_counts = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt']
48 |
49 | # Outputs
50 | if without_noninfo:
51 | out_lists = ['train_feat_list_without_noninfo.txt',
52 | 'val_feat_list_without_noninfo.txt',
53 | 'test_feat_list_without_noninfo.txt']
54 | counts_lists = ['train_feat_counts_without_noninfo.txt',
55 | 'val_feat_counts_without_noninfo.txt',
56 | 'test_feat_counts_without_noninfo.txt']
57 | else:
58 | out_lists = ['train_feat_list.txt', 'val_feat_list.txt', 'test_feat_list.txt']
59 | counts_lists = ['train_feat_counts.txt', 'val_feat_counts.txt', 'test_feat_counts.txt']
60 |
61 | #########
62 |
63 | if os.path.isdir(base_path + '/' + path_features + '/' + features_name):
64 | shutil.rmtree(base_path + '/' + path_features + '/' + features_name)
65 | os.makedirs(base_path + '/' + path_features + '/' + features_name)
66 |
67 | if not os.path.isdir(base_path + '/' + path_annotations + '/' + features_name):
68 | os.makedirs(base_path + '/' + path_annotations + '/' + features_name)
69 |
70 | c_videos = 0
71 | for f, fc, o, c in zip(features_files, features_counts, out_lists, counts_lists):
72 | print "Processing " + f
73 |
74 | f = open(base_path + '/' + path_features + '/' + f, 'r')
75 | fc = open(base_path + '/' + path_features + '/' + fc, 'r')
76 | o = open(base_path + '/' + path_annotations + '/' + features_name + '/' + o, 'w')
77 | c = open(base_path + '/' + path_annotations + '/' + features_name + '/' + c, 'w')
78 |
79 | all_counts = list()
80 | for line in fc:
81 | line = line.strip('\n')
82 | all_counts.append(int(line))
83 |
84 | c_frame = 0
85 | c_videos_split = 0
86 | # Process each line in the file
87 | for enum, line in enumerate(f):
88 | frame = line.strip('\n')
89 |         frame = np.fromstring(frame, sep=',')  # convert CSV line to a NumPy array
90 |
91 | this_path = "%s/video_%0.4d" % (path_features + '/' + features_name, c_videos)
92 | if not os.path.isdir(base_path + this_path):
93 | os.makedirs(base_path + this_path)
94 | this_path = "%s/video_%0.4d/frame_%0.4d.npy" % (path_features + '/' + features_name, c_videos, c_frame)
95 |         # Save array to disk
96 | try:
97 | np.save(base_path + this_path, frame)
98 | except:
99 | print 'line file', enum
100 | print 'file name', base_path + this_path
101 | print 'lenvec', len(frame)
102 | print 'vec', frame
103 | print
104 | # Write path to file
105 | o.write(this_path + '\n')
106 |
107 | c_frame += 1
108 |
109 | # a complete video was processed
110 | if c_frame % all_counts[c_videos_split] == 0:
111 | c_videos += 1
112 | c.write(str(all_counts[c_videos_split]) + '\n') # store counts
113 | c_videos_split += 1
114 | c_frame = 0
115 |
116 | f.close()
117 | fc.close()
118 | o.close()
119 | c.close()
120 |
121 | print 'Done!'
122 |
--------------------------------------------------------------------------------
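A hedged sketch of how the frame lists and counts written by generate_features_lists.py above can be regrouped into one feature matrix per video; the base path and file names below are placeholders for whatever base_path and features_name were configured:

import numpy as np

base_path = '/path/to/EDUB-SegDesc/'                    # placeholder
list_file = base_path + 'Annotations/ImageNet/train_feat_list.txt'
counts_file = base_path + 'Annotations/ImageNet/train_feat_counts.txt'

with open(list_file) as f:
    frame_paths = [l.strip() for l in f]                # one relative .npy path per frame
with open(counts_file) as f:
    counts = [int(l.strip()) for l in f]                # number of frames of each video

videos = []
i = 0
for c in counts:
    # stack the per-frame vectors of one video into a (n_frames, feat_len) array
    videos.append(np.array([np.load(base_path + p) for p in frame_paths[i:i + c]]))
    i += c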
/data_engine/generate_img_lists.py:
--------------------------------------------------------------------------------
1 | import glob
2 |
3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/'
4 |
5 | # Inputs
6 | split_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt']
7 | imgs_format = '.jpg'
8 | path_imgs = 'Images'
9 | path_files = 'Annotations'
10 |
11 | # Outputs
12 | out_lists = ['train_imgs_list.txt', 'val_imgs_list.txt', 'test_imgs_list.txt']
13 | counts_lists = ['train_imgs_counts.txt', 'val_imgs_counts.txt', 'test_imgs_counts.txt']
14 |
15 | # Code
16 | print 'Listing all images from all videos...'
17 |
18 | len_base = len(base_path)
19 | for s, o, c in zip(split_lists, out_lists, counts_lists):
20 | s = open(base_path + '/' + path_files + '/' + s, 'r')
21 | o = open(base_path + '/' + path_files + '/' + o, 'w')
22 | c = open(base_path + '/' + path_files + '/' + c, 'w')
23 | for line in s:
24 | video = line.strip('\n')
25 | this_path = base_path + '/' + path_imgs + "/video_" + video + "/*" + imgs_format
26 | images = glob.glob(this_path)
27 | for im in images:
28 | # o.write(path_imgs+"/video_"+video+"/"+im+'\n') # store each image path
29 | o.write(im[len_base:] + '\n')
30 | c.write(str(len(images)) + '\n') # store counts
31 | s.close()
32 | o.close()
33 | c.close()
34 |
35 | print 'Done!'
36 |
--------------------------------------------------------------------------------
/data_engine/generate_img_lists_from_split.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import xlrd
5 |
6 | # Split the existing data into train, val and test
7 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc'
8 |
9 | # input data paths
10 | in_descriptions_path = 'GT/descriptions'
11 | in_segments_path = 'GT/segmentations'
12 | in_images_path = 'Images'  # <in_images_path>/<day_set>/<image>.jpg
13 | imgs_format = '.jpg'
14 |
15 | # output data paths
16 | out_features_path = 'Features'  # <split>_<features_name>_all_frames.csv & <split>_<features_name>_all_frames_counts.txt
17 | out_descriptions_path = 'Annotations'
18 | out_image_lists_path = 'Annotations'  # <split>_imgs_list.txt & <split>_imgs_counts.txt
19 |
20 | # Get day_sets for each data split
21 | sets = dict()
22 | for s in ['train', 'val', 'test']:
23 | sets[s] = []
24 | with open(data_path + '/' + out_descriptions_path + '/' + s + '_list_final.txt', 'r') as list_file:
25 | prev_set = -1
26 | for line in list_file:
27 | line = line.rstrip('\n')
28 | line = line.split('_')
29 | if line[0] != prev_set:
30 | sets[s].append(line[0])
31 | prev_set = line[0]
32 |
33 | # Get segments' IDs with errors
34 | errors = dict()
35 | for s in ['train', 'val', 'test']:
36 | errors[s] = dict()
37 | for day_split in sets[s]:
38 | errors[s][day_split] = []
39 | with open(data_path + '/' + in_descriptions_path + '/' + day_split + '.txt', 'r') as list_file:
40 | for line in list_file:
41 | line = line.rstrip('\n').split(',')
42 | segm_id = int(line[0][7:])
43 | desc = ','.join(line[1:])
44 | desc = desc.strip().lower()
45 | if desc == 'error':
46 | errors[s][day_split].append(segm_id)
47 |
48 | # Get events of correct segments
49 | for s in ['train', 'val', 'test']:
50 |
51 | file_imgs = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_list.txt', 'w')
52 | file_counts = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_counts.txt', 'w')
53 |
54 | for day_split in sets[s]:
55 | possible_names = ['/GT_' + day_split + '.xls', '/GT_' + day_split + '.xlsx', '/' + day_split + '.xls',
56 | '/' + day_split + '.xlsx']
57 | exists = False
58 | i = 0
59 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]):
60 | i += 1
61 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i])
62 | sheet = file.sheet_by_index(0)
63 |
64 | count_segments = 1
65 | these_events = []
66 | empty = False
67 | i = 2 # 1st row with info
68 | while not empty:
69 | try:
70 | evt = sheet.cell(i, 1).value.split()
71 | if len(evt) == 1:
72 | evt = sheet.cell(i, 1).value.split('-')
73 | if evt:
74 | if count_segments not in errors[s][day_split]: # avoid segments with errors (dark/blurry images)
75 | these_events.append([evt[0].strip(), evt[1].strip()])
76 | else:
77 | empty = True
78 | i += 1
79 | count_segments += 1
80 | except:
81 | empty = True
82 |
83 | # Get list of images
84 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + day_split + '/*' + imgs_format)
85 | final_these_images = []
86 | for im in these_images:
87 | final_these_images.append(im.split('/')[-1].split('.')[0])
88 | final_these_images = sorted(final_these_images)
89 |
90 | for e in these_events:
91 | if e[1] not in final_these_images:
92 | e[1] = '0' + e[1]
93 | if e[0] not in final_these_images:
94 | e[0] = '0' + e[0]
95 |
96 | fin_idx = final_these_images.index(e[1]) + 1
97 | ini_idx = final_these_images.index(e[0])
98 | current_event_imgs = final_these_images[ini_idx:fin_idx]
99 |
100 | # Store in files
101 | this_count = 0
102 | for imid in current_event_imgs:
103 | file_imgs.write(in_images_path + '/' + day_split + '/' + imid + imgs_format + '\n')
104 | this_count += 1
105 | file_counts.write(str(this_count) + '\n')
106 |
107 | file_imgs.close()
108 | file_counts.close()
109 |
110 | print 'DONE!'
111 |
--------------------------------------------------------------------------------
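A small illustration (with invented frame identifiers) of how each event's [first frame, last frame] pair taken from the segmentation sheet selects a contiguous slice of the sorted image names in the loop above:

final_these_images = ['00123', '00124', '00125', '00126', '00127']   # sorted image ids of one day
event = ['00124', '00126']                                           # [first frame, last frame]

ini_idx = final_these_images.index(event[0])        # 1
fin_idx = final_these_images.index(event[1]) + 1    # 4, exclusive upper bound
current_event_imgs = final_these_images[ini_idx:fin_idx]
# current_event_imgs == ['00124', '00125', '00126']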
/data_engine/generate_link_lists.py:
--------------------------------------------------------------------------------
1 | ## Parameters
2 |
3 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/'
4 |
5 | path_files = 'Annotations'
6 | without_noninfo = True
7 |
8 | # Names of the different samples
9 | # All samples belonging to the same day must meet the following requirements:
10 | # - Be listed contiguously, without mixing with other days
11 | # - Be stored in chronological order
12 | # - Include the day identifier at the beginning of the line, separated by the symbol '_'
13 | # Example:
14 | # Day1_video_1
15 | # Day1_video_2
16 | # Day1_video_3
17 | # Day2_video_1
18 | # Day2_video_2
19 | ####
20 |
21 | if without_noninfo:
22 | suffix = '_without_noninfo'
23 | else:
24 | suffix = ''
25 |
26 | train = 'train_list_final' + suffix + '.txt'
27 | val = 'val_list_final' + suffix + '.txt'
28 | test = 'test_list_final' + suffix + '.txt'
29 |
30 | # Outputs
31 | train_out = 'train_link_samples' + suffix + '.txt'
32 | val_out = 'val_link_samples' + suffix + '.txt'
33 | test_out = 'test_link_samples' + suffix + '.txt'
34 |
35 | #################################
36 |
37 | ## Code
38 |
39 | # Generate temporal links between samples which belong to the same day
40 | for fin, fout in zip([train, val, test], [train_out, val_out, test_out]):
41 |
42 | with open(base_path + '/' + path_files + '/' + fin, 'r') as fi, open(base_path + '/' + path_files + '/' + fout,
43 | 'w') as fo:
44 | prev_day_name = ''
45 | lines_counter = -1
46 | for line in fi:
47 | day_name = line.split('_')[0]
48 | if day_name == prev_day_name:
49 | fo.write(str(lines_counter) + '\n')
50 | lines_counter += 1
51 | else:
52 | fo.write('-1\n')
53 | lines_counter += 1
54 |
55 | prev_day_name = day_name
56 |
57 | print 'Done'
58 |
--------------------------------------------------------------------------------
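A worked example (with invented segment names) of the link file produced by generate_link_lists.py above: '-1' marks the first sample of a day, and any other value is the 0-based index of the preceding sample of the same day:

split_lines = ['Day1_Segment_1', 'Day1_Segment_2', 'Day1_Segment_3',
               'Day2_Segment_1', 'Day2_Segment_2']

links = []
prev_day = ''
for i, name in enumerate(split_lines):
    day = name.split('_')[0]
    links.append(str(i - 1) if day == prev_day else '-1')
    prev_day = day

# links == ['-1', '0', '1', '-1', '3']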
/data_engine/generate_parallel_corpus.py:
--------------------------------------------------------------------------------
1 | """
2 | Generates a parallel corpus from the EDUB-GT Annotations:
3 | One 'language' is the image captions.
4 | The other 'language' is, for each caption, the set of captions of the previous segment.
5 | """
6 |
7 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/'
8 |
9 | txt_files = base_path + 'text.clean.txt'
10 | dest_files = base_path + 'training.'
11 |
12 | file = open(txt_files, mode='r')
13 |
14 | file_prevs = open(dest_files + 'prev', mode='w')
15 | file_curr = open(dest_files + 'curr', mode='w')
16 |
17 | prev_id = 'Segment1'
18 | caps_txt = []
19 | prev_caps = ['None']
20 | i = 0
21 | for line in file:
22 | id_text = line.split(",")
23 | id = id_text[0]
24 | text = ' '.join(id_text[1:]).strip()
25 | if id == prev_id:
26 | caps_txt.append(text)
27 | elif id == 'Segment1':
28 | prev_id = id
29 | prev_caps = ['None']
30 | caps_txt.append(text)
31 | for curr_cap in caps_txt:
32 | for prev_cap in prev_caps:
33 | file_prevs.write(prev_cap + '\n')
34 | file_curr.write(curr_cap + '\n')
35 | i += 1
36 | else:
37 | caps_txt.append(text)
38 | for curr_cap in caps_txt:
39 | for prev_cap in prev_caps:
40 | file_prevs.write(prev_cap + '\n')
41 | file_curr.write(curr_cap + '\n')
42 | i += 1
43 |
44 | prev_id = id
45 | prev_caps = caps_txt
46 | caps_txt = []
47 |
48 | # flush the captions accumulated for the last segment and close the files
49 | for curr_cap in caps_txt:
50 |     for prev_cap in prev_caps:
51 |         file_prevs.write(prev_cap + '\n')
52 |         file_curr.write(curr_cap + '\n')
53 |
54 | file.close()
55 | file_prevs.close()
56 | file_curr.close()
57 |
--------------------------------------------------------------------------------
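A hedged illustration (with invented captions) of the pairing generate_parallel_corpus.py above aims to produce: every caption of a segment is written once per caption of the previous segment, and the captions of the first segment of a day are paired with 'None':

prev_caps = ['i had breakfast', 'i ate cereal']   # captions of the previous segment
caps_txt = ['i went to work']                     # captions of the current segment

pairs = [(prev_cap, curr_cap) for curr_cap in caps_txt for prev_cap in prev_caps]
# pairs == [('i had breakfast', 'i went to work'), ('i ate cereal', 'i went to work')]
# 'training.prev' receives the first element of each pair, 'training.curr' the second.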
/data_engine/prepare_data.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import logging
3 |
4 | import numpy as np
5 |
6 | from keras_wrapper.dataset import Dataset, saveDataset, loadDataset
7 | from keras_wrapper.extra.read_write import pkl2dict
8 |
9 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
10 |
11 |
12 | def build_dataset(params):
13 | if params['REBUILD_DATASET']: # We build a new dataset instance
14 | if params['VERBOSE'] > 0:
15 | silence = False
16 | logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
17 | else:
18 | silence = True
19 |
20 | base_path = params['DATA_ROOT_PATH']
21 | name = params['DATASET_NAME']
22 | ds = Dataset(name, base_path, silence=silence)
23 |
24 | if not '-vidtext-embed' in params['DATASET_NAME']:
25 | # OUTPUT DATA
26 | # Let's load the train, val and test splits of the descriptions (outputs)
27 | # the files include a description per line. In this dataset a variable number
28 |             # of descriptions per video is provided.
29 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
30 | 'train',
31 | type='text',
32 | id=params['OUTPUTS_IDS_DATASET'][0],
33 | build_vocabulary=True,
34 | tokenization=params['TOKENIZATION_METHOD'],
35 | fill=params['FILL'],
36 | pad_on_batch=True,
37 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
38 | sample_weights=params['SAMPLE_WEIGHTS'],
39 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
40 |
41 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['val'],
42 | 'val',
43 | type='text',
44 | id=params['OUTPUTS_IDS_DATASET'][0],
45 | build_vocabulary=True,
46 | pad_on_batch=True,
47 | tokenization=params['TOKENIZATION_METHOD'],
48 | sample_weights=params['SAMPLE_WEIGHTS'],
49 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
50 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
51 |
52 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['test'],
53 | 'test',
54 | type='text',
55 | id=params['OUTPUTS_IDS_DATASET'][0],
56 | build_vocabulary=True,
57 | pad_on_batch=True,
58 | tokenization=params['TOKENIZATION_METHOD'],
59 | sample_weights=params['SAMPLE_WEIGHTS'],
60 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
61 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
62 |
63 | else:
64 | # Use descriptions as inputs instead --> 'matching'/'non-matching' as output
65 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
66 | 'train',
67 | type='text',
68 | id=params['INPUTS_IDS_DATASET'][1],
69 | build_vocabulary=True,
70 | tokenization=params['TOKENIZATION_METHOD'],
71 | fill=params['FILL'],
72 | pad_on_batch=True,
73 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
74 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
75 |
76 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['val'],
77 | 'val',
78 | type='text',
79 | id=params['INPUTS_IDS_DATASET'][1],
80 | build_vocabulary=True,
81 | pad_on_batch=True,
82 | tokenization=params['TOKENIZATION_METHOD'],
83 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
84 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
85 |
86 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['test'],
87 | 'test',
88 | type='text',
89 | id=params['INPUTS_IDS_DATASET'][1],
90 | build_vocabulary=True,
91 | pad_on_batch=True,
92 | tokenization=params['TOKENIZATION_METHOD'],
93 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
94 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
95 |
96 | # INPUT DATA
97 | # Let's load the associated videos (inputs)
98 |         # we must take into account that in this dataset we have a different number of sentences per video;
99 | # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
100 | # containing the number of captions in each video.
101 |
102 | num_captions_train = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train'])
103 | num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val'])
104 | num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test'])
105 |
106 | for feat_type in params['FEATURE_NAMES']:
107 | for split, num_cap in zip(['train', 'val', 'test'],
108 | [num_captions_train, num_captions_val, num_captions_test]):
109 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][split] % feat_type
110 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][split] % feat_type
111 |
112 | ds.setInput([list_files, counts_files],
113 | split,
114 | type=params['INPUT_DATA_TYPE'],
115 | id=params['INPUTS_IDS_DATASET'][0],
116 | repeat_set=num_cap,
117 | max_video_len=params['NUM_FRAMES'],
118 | feat_len=params['IMG_FEAT_SIZE'],
119 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])
120 |
121 | if not '-vidtext-embed' in params['DATASET_NAME'] and len(params['INPUTS_IDS_DATASET']) > 1:
122 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
123 | 'train',
124 | type='text',
125 | id=params['INPUTS_IDS_DATASET'][1],
126 | required=False,
127 | tokenization=params['TOKENIZATION_METHOD'],
128 | pad_on_batch=True,
129 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
130 | offset=1,
131 | fill=params['FILL'],
132 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
133 | max_words=params['OUTPUT_VOCABULARY_SIZE'],
134 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
135 |
136 | ds.setInput(None, 'val', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False)
137 | ds.setInput(None, 'test', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False)
138 |
139 | # Set inputs for temporally-linked samples
140 | if not '-vidtext-embed' in params['DATASET_NAME'] and '-linked' in params['DATASET_NAME']:
141 | # Set input captions from previous event/video
142 | if '-upperbound' not in params['DATASET_NAME']:
143 | if '-vidtext' in params['DATASET_NAME']: # use both previous video and previous description
144 |
145 | ds, repeat_images = insertTemporallyLinkedCaptionsVidText(ds, params,
146 | vidtext_set_names={
147 | 'video': ['train', 'val', 'test'],
148 | 'text': ['train']})
149 | del repeat_images['test']
150 | del repeat_images['val']
151 | # Insert empty prev_descriptions on val and test sets
152 | ds.setInput([],
153 | 'val',
154 | type='text',
155 | id=params['INPUTS_IDS_DATASET'][2],
156 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
157 | tokenization=params['TOKENIZATION_METHOD'],
158 | fill=params['FILL'],
159 | pad_on_batch=True,
160 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
161 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
162 | required=False,
163 | overwrite_split=True)
164 | ds.setInput([],
165 | 'test',
166 | type='text',
167 | id=params['INPUTS_IDS_DATASET'][2],
168 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
169 | tokenization=params['TOKENIZATION_METHOD'],
170 | fill=params['FILL'],
171 | pad_on_batch=True,
172 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
173 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
174 | required=False,
175 | overwrite_split=True)
176 |
177 | elif '-video' in params['DATASET_NAME']:
178 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, params,
179 | set_names=['train', 'val', 'test'],
180 | video=True)
181 | num_captions_val = repeat_images['val']
182 | num_captions_test = repeat_images['test']
183 | else:
184 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, params)
185 | # Insert empty prev_descriptions on val and test sets
186 | ds.setInput([],
187 | 'val',
188 | type='text',
189 | id=params['INPUTS_IDS_DATASET'][2],
190 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
191 | tokenization=params['TOKENIZATION_METHOD'],
192 | fill=params['FILL'],
193 | pad_on_batch=True,
194 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
195 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
196 | required=False,
197 | overwrite_split=True)
198 | ds.setInput([],
199 | 'test',
200 | type='text',
201 | id=params['INPUTS_IDS_DATASET'][2],
202 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
203 | tokenization=params['TOKENIZATION_METHOD'],
204 | fill=params['FILL'],
205 | pad_on_batch=True,
206 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
207 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
208 | required=False,
209 | overwrite_split=True)
210 | else:
211 | ds, repeat_images = insertTemporallyLinkedCaptions(ds,
212 | params,
213 | set_names=['train', 'val', 'test'],
214 | upperbound=True,
215 | video='-video' in params['DATASET_NAME'],
216 | copy='-copy' in params['DATASET_NAME'],
217 | force_nocopy='-nocopy' in params['DATASET_NAME'],
218 | prev='-prev' in params['DATASET_NAME'])
219 | num_captions_val = repeat_images['val']
220 | num_captions_test = repeat_images['test']
221 |
222 | if not '-vidtext-embed' in params['DATASET_NAME']:
223 |         # Process the dataset to keep only one caption per video, storing the rest in a dict() with the following format:
224 | # ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
225 | keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1, set_names=['val', 'test'])
226 |
227 | else:
228 | # Set outputs for -vidtext-embed model
229 | insertVidTextEmbedNegativeSamples(ds, params,
230 | repeat=[num_captions_train, num_captions_val, num_captions_test])
231 |
232 | if not '-vidtext-embed' in params['DATASET_NAME'] and \
233 | '-linked' in params['DATASET_NAME'] and \
234 | '-upperbound' not in params['DATASET_NAME'] and \
235 | '-video' not in params['DATASET_NAME']:
236 | # Set previous data indices
237 | for s, file in params['LINK_SAMPLE_FILES'].iteritems():
238 | if s in repeat_images:
239 | rep = repeat_images[s]
240 | else:
241 | rep = 1
242 | ds.setInput(base_path + '/' + file,
243 | s,
244 | type='id',
245 | id=params['INPUTS_IDS_DATASET'][-1],
246 | repeat_set=rep)
247 |
248 |         # We have finished loading the dataset; now we can store it for future use
249 | saveDataset(ds, params['DATASET_STORE_PATH'])
250 | else:
251 | # We can easily recover it with a single line
252 | ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')
253 |
254 | # Load vocabulary-related parameters of dataset used for pre-training
255 | if params['PRE_TRAINED_DATASET_NAME'] is not None:
256 | logging.info('Re-using previous dataset vocabulary ' + params['PRE_TRAINED_DATASET_NAME'])
257 | dataset_pretrained = loadDataset(
258 | params['DATASET_STORE_PATH'] + 'Dataset_' + params['PRE_TRAINED_DATASET_NAME'] + '.pkl')
259 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems():
260 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained.vocabulary[id_old])
261 | ds.vocabulary_len[id_new] = copy.deepcopy(dataset_pretrained.vocabulary_len[id_old])
262 | elif params['PRE_TRAINED_VOCABULARY_NAME'] is not None:
263 | logging.info('Re-using previous vocabulary ' + params['PRE_TRAINED_VOCABULARY_NAME'])
264 | dataset_pretrained_vocabulary = pkl2dict(
265 | params['DATASET_STORE_PATH'] + params['PRE_TRAINED_VOCABULARY_NAME'] + '.pkl')
266 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems():
267 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained_vocabulary[id_old])
268 | ds.vocabulary_len[id_new] = len(dataset_pretrained_vocabulary[id_old]['idx2words'])
269 |
270 | return ds
271 |
272 |
273 | def keep_n_captions(ds, repeat, n=1, set_names=['val', 'test']):
274 |     ''' Keeps only n captions per sample and stores the rest in dictionaries for later evaluation
275 | '''
276 |
277 | for s, r in zip(set_names, repeat):
278 | logging.info('Keeping ' + str(n) + ' captions per input on the ' + str(s) + ' set.')
279 |
280 | ds.extra_variables[s] = dict()
281 | exec ('n_samples = ds.len_' + s)
282 |
283 | # Process inputs
284 | for id_in in ds.ids_inputs:
285 | new_X = []
286 | if id_in in ds.optional_inputs:
287 | try:
288 | exec ('X = ds.X_' + s)
289 | i = 0
290 | for next_repeat in r:
291 | for j in range(n):
292 | new_X.append(X[id_in][i + j])
293 | i += next_repeat
294 | exec ('ds.X_' + s + '[id_in] = new_X')
295 | except:
296 | pass
297 | else:
298 | exec ('X = ds.X_' + s)
299 | i = 0
300 | for next_repeat in r:
301 | for j in range(n):
302 | new_X.append(X[id_in][i + j])
303 | i += next_repeat
304 | exec ('ds.X_' + s + '[id_in] = new_X')
305 | # Process outputs
306 | for id_out in ds.ids_outputs:
307 | new_Y = []
308 | exec ('Y = ds.Y_' + s)
309 | dict_Y = dict()
310 | count_samples = 0
311 | i = 0
312 | for next_repeat in r:
313 | dict_Y[count_samples] = []
314 | for j in range(next_repeat):
315 | if j < n:
316 | new_Y.append(Y[id_out][i + j])
317 | dict_Y[count_samples].append(Y[id_out][i + j])
318 | count_samples += 1
319 | i += next_repeat
320 | exec ('ds.Y_' + s + '[id_out] = new_Y')
321 | # store dictionary with vid_pos -> [cap1, cap2, cap3, ..., capNi]
322 | ds.extra_variables[s][id_out] = dict_Y
323 |
324 | new_len = len(new_Y)
325 | exec ('ds.len_' + s + ' = new_len')
326 | logging.info('Samples reduced to ' + str(new_len) + ' in ' + s + ' set.')
327 |
328 |
329 | def insertTemporallyLinkedCaptions(ds, params, set_names=['train'],
330 | upperbound=False,
331 | video=False, copy=False, force_nocopy=False, prev=False):
332 | """
333 | Inserts an additional input consisting of the desired captions from the previous segment/event
334 | in chronological order. Example:
335 |         <video_n>: <caption_1 of video_n-1>
336 |         <video_n>: <caption_2 of video_n-1>
337 |         .
338 |         .
339 |         .
340 |         <video_n+1>: <caption_1 of video_n>
341 |         <video_n+1>: <caption_2 of video_n>
342 |         .
343 |         .
344 |         .
345 |
346 | :param ds: dataset to modify
347 | :param params: parameters from config
348 | :param set_names: names of the splits that will be modified (default 'train' only)
349 | :param upperbound: whether we want to generate a dataset for an upper bound comparison by using the same captions both as input and output
350 |     :param video: whether we use the previous event's video as input instead of the previous caption
351 |     :param copy: generates an upperbound dataset intended only for copying, giving only matching input-output sequences (only valid if upperbound=True)
352 | :param force_nocopy: generates an upperbound dataset using the same captions both as input and output but avoiding direct copies
353 | :param prev: indicates if we want to use the previous event's caption as input for the next, or use the current event's output instead
354 |
355 | :return: dataset modified with the additional input
356 | """
357 | base_path = params['DATA_ROOT_PATH']
358 | repeat_images = dict()
359 |
360 | for s in set_names:
361 | # retrieve number of output captions per sample
362 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s])
363 |
364 | # get temporal links
365 | links = []
366 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links:
367 | for line in f_links:
368 | links.append(int(line.strip()))
369 |
370 | outputs = []
371 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs:
372 | for line in f_outs:
373 | outputs.append(line.strip())
374 |
375 | # get outputs
376 | if video:
377 | prev_videos = []
378 | for feat_type in params['FEATURE_NAMES']:
379 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type
380 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type
381 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts:
382 | prev_videos.append(
383 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]])
384 |
385 | # modify outputs and prepare inputs
386 | images_repeat = []
387 | upperbound_images_repeat = []
388 | final_outputs = []
389 | if video:
390 | final_inputs = dict()
391 | for feat_type in params['FEATURE_NAMES']:
392 | final_inputs[feat_type] = [[], []]
393 | else:
394 | final_inputs = []
395 | for i, link in enumerate(links):
396 | ini_out = np.sum(num_cap[:i])
397 | these_outputs = outputs[ini_out:ini_out + num_cap[i]]
398 |
399 | if upperbound:
400 | if copy:
401 | images_repeat.append(num_cap[i])
402 | upperbound_images_repeat.append(num_cap[i])
403 | for out in these_outputs:
404 | final_outputs.append(out)
405 | final_inputs.append(out)
406 | elif prev:
407 | # first sample in the temporally-linked sequence
408 | if link == -1:
409 | images_repeat.append(num_cap[i])
410 | upperbound_images_repeat.append(num_cap[i])
411 | for out in these_outputs:
412 | final_outputs.append(out)
413 | final_inputs.append('')
414 | else:
415 | prev_ini_out = np.sum(num_cap[:link])
416 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]]
417 | images_repeat.append(num_cap[i] * num_cap[link])
418 | for n in range(num_cap[link]):
419 | upperbound_images_repeat.append(num_cap[i])
420 | for out in these_outputs:
421 | final_outputs.append(out)
422 | final_inputs.append(prev_outputs[n])
423 | elif force_nocopy:
424 | raise NotImplementedError()
425 | prev_outputs = these_outputs
426 | images_repeat.append(num_cap[i] * (num_cap[i] - 1))
427 | for n in range(num_cap[i]):
428 | upperbound_images_repeat.append(num_cap[i] - 1)
429 | for nthese, out in enumerate(these_outputs):
430 | if nthese != n:
431 | final_outputs.append(out)
432 | final_inputs.append(prev_outputs[n])
433 | else:
434 | prev_outputs = these_outputs
435 | images_repeat.append(num_cap[i] * num_cap[i])
436 | for n in range(num_cap[i]):
437 | upperbound_images_repeat.append(num_cap[i])
438 | for out in these_outputs:
439 | final_outputs.append(out)
440 | final_inputs.append(prev_outputs[n])
441 | else:
442 | if video:
443 | # first sample in the temporally-linked sequence
444 | if link == -1:
445 | images_repeat.append(num_cap[i])
446 | for out in these_outputs:
447 | final_outputs.append(out)
448 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']):
449 | final_inputs[feat_type][1] += [0]
450 | else:
451 | images_repeat.append(num_cap[i])
452 | for out in these_outputs:
453 | final_outputs.append(out)
454 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']):
455 | if link > 0:
456 | init_frame = int(sum(prev_videos[ifeat][1][:link]))
457 | else:
458 | init_frame = 0
459 | this_count = prev_videos[ifeat][1][link]
460 | final_inputs[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count]
461 | final_inputs[feat_type][1] += [this_count]
462 | else:
463 | # first sample in the temporally-linked sequence
464 | if link == -1:
465 | images_repeat.append(num_cap[i])
466 | for out in these_outputs:
467 | final_outputs.append(out)
468 | final_inputs.append('')
469 | else:
470 | prev_ini_out = np.sum(num_cap[:link])
471 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]]
472 | images_repeat.append(num_cap[i] * num_cap[link])
473 | for n in range(num_cap[link]):
474 | for out in these_outputs:
475 | final_outputs.append(out)
476 | final_inputs.append(prev_outputs[n])
477 |
478 | # Overwrite input images assigning the new repeat pattern
479 | for feat_type in params['FEATURE_NAMES']:
480 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type
481 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type
482 |
483 | ds.setInput([list_files, counts_files],
484 | s,
485 | type=params['INPUT_DATA_TYPE'],
486 | id=params['INPUTS_IDS_DATASET'][0],
487 | repeat_set=images_repeat,
488 | max_video_len=params['NUM_FRAMES'],
489 | feat_len=params['IMG_FEAT_SIZE'],
490 | overwrite_split=True,
491 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])
492 |
493 | if not video:
494 | # Overwrite outputs assigning the new outputs repeat pattern
495 | ds.setOutput(final_outputs,
496 | s,
497 | type='text',
498 | id=params['OUTPUTS_IDS_DATASET'][0],
499 | build_vocabulary=True,
500 | tokenization=params['TOKENIZATION_METHOD'],
501 | fill=params['FILL'],
502 | pad_on_batch=True,
503 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
504 | sample_weights=params['SAMPLE_WEIGHTS'],
505 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
506 | overwrite_split=True)
507 |
508 | # Overwrite the input state_below assigning the new outputs repeat pattern
509 | ds.setInput(final_outputs,
510 | s,
511 | type='text',
512 | id=params['INPUTS_IDS_DATASET'][1],
513 | required=False,
514 | tokenization=params['TOKENIZATION_METHOD'],
515 | pad_on_batch=True,
516 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
517 | offset=1,
518 | fill=params['FILL'],
519 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
520 | max_words=params['OUTPUT_VOCABULARY_SIZE'],
521 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
522 | overwrite_split=True)
523 |
524 | if video:
525 | for feat_type in params['FEATURE_NAMES']:
526 | ds.setInput(final_inputs[feat_type],
527 | s,
528 | type=params['INPUT_DATA_TYPE'],
529 | id=params['INPUTS_IDS_DATASET'][2],
530 | repeat_set=images_repeat,
531 | max_video_len=params['NUM_FRAMES'],
532 | feat_len=params['IMG_FEAT_SIZE'],
533 | overwrite_split=True,
534 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])
535 | else:
536 | # Set new input captions from previous temporally-linked event/video
537 | ds.setInput(final_inputs,
538 | s,
539 | type='text',
540 | id=params['INPUTS_IDS_DATASET'][2],
541 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
542 | tokenization=params['TOKENIZATION_METHOD'],
543 | fill=params['FILL'],
544 | pad_on_batch=True,
545 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
546 | min_occ=params['MIN_OCCURRENCES_VOCAB'])
547 |
548 | if upperbound:
549 | images_repeat = upperbound_images_repeat
550 | repeat_images[s] = images_repeat
551 |
552 | return ds, repeat_images
553 |
554 |
555 | def insertTemporallyLinkedCaptionsVidText(ds, params, vidtext_set_names={'video': ['train'], 'text': ['train']}):
556 | """
557 |     Inserts two additional inputs consisting of the videos and captions from the previous segment/event
558 | in chronological order. Example:
559 |         <video_n>: <video_n-1 and caption_1 of video_n-1>
560 |         <video_n>: <video_n-1 and caption_2 of video_n-1>
561 |         .
562 |         .
563 |         .
564 |         <video_n+1>: <video_n and caption_1 of video_n>
565 |         <video_n+1>: <video_n and caption_2 of video_n>
566 |         .
567 |         .
568 |         .
569 |
570 | :param ds: dataset to modify
571 | :param params: parameters from config
572 | :param vidtext_set_names: dictionary names of the splits that will be modified for 'video' and for 'text'
573 |
574 | :return: dataset modified with the additional input
575 | """
576 | base_path = params['DATA_ROOT_PATH']
577 | repeat_images = dict()
578 |
579 | set_names = set(vidtext_set_names['video'] + vidtext_set_names['text'])
580 | for s in set_names:
581 | # retrieve number of output captions per sample
582 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s])
583 |
584 | # get temporal links
585 | links = []
586 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links:
587 | for line in f_links:
588 | links.append(int(line.strip()))
589 |
590 | outputs = []
591 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs:
592 | for line in f_outs:
593 | outputs.append(line.strip())
594 |
595 | # get outputs
596 | if s in vidtext_set_names['video']:
597 | prev_videos = []
598 | for feat_type in params['FEATURE_NAMES']:
599 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type
600 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type
601 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts:
602 | prev_videos.append(
603 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]])
604 |
605 | # modify outputs and prepare inputs
606 | images_repeat = []
607 | final_outputs = []
608 | if s in vidtext_set_names['video']:
609 | final_inputs_vid = dict()
610 | for feat_type in params['FEATURE_NAMES']:
611 | final_inputs_vid[feat_type] = [[], []]
612 | final_inputs_txt = []
613 |
614 | for i, link in enumerate(links):
615 | ini_out = np.sum(num_cap[:i])
616 | these_outputs = outputs[ini_out:ini_out + num_cap[i]]
617 |
618 | # first sample in the temporally-linked sequence
619 | if link == -1:
620 | images_repeat.append(num_cap[i])
621 | for out in these_outputs:
622 | final_outputs.append(out)
623 | if s in vidtext_set_names['text']:
624 | final_inputs_txt.append('')
625 | if s in vidtext_set_names['video']:
626 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']):
627 | final_inputs_vid[feat_type][1] += [0]
628 | else:
629 | if s in vidtext_set_names['text']:
630 | prev_ini_out = np.sum(num_cap[:link])
631 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]]
632 | images_repeat.append(num_cap[i] * num_cap[link])
633 | else:
634 | images_repeat.append(num_cap[i])
635 |
636 | # video only
637 | if s not in vidtext_set_names['text'] and s in vidtext_set_names['video']:
638 | for out in these_outputs:
639 | final_outputs.append(out)
640 |
641 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']):
642 | if link > 0:
643 | init_frame = int(sum(prev_videos[ifeat][1][:link]))
644 | else:
645 | init_frame = 0
646 | this_count = prev_videos[ifeat][1][link]
647 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count]
648 | final_inputs_vid[feat_type][1] += [this_count]
649 |
650 | # text only
651 | elif s in vidtext_set_names['text'] and s not in vidtext_set_names['video']:
652 | for n in range(num_cap[link]):
653 | for out in these_outputs:
654 | final_outputs.append(out)
655 | final_inputs_txt.append(prev_outputs[n])
656 |
657 | # both
658 | else:
659 | for n in range(num_cap[link]):
660 | for out in these_outputs:
661 | final_outputs.append(out)
662 | final_inputs_txt.append(prev_outputs[n])
663 |
664 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']):
665 | if link > 0:
666 | init_frame = int(sum(prev_videos[ifeat][1][:link]))
667 | else:
668 | init_frame = 0
669 | this_count = prev_videos[ifeat][1][link]
670 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count]
671 | final_inputs_vid[feat_type][1] += [this_count]
672 |
673 | # Overwrite input images assigning the new repeat pattern
674 | for feat_type in params['FEATURE_NAMES']:
675 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type
676 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type
677 |
678 | ds.setInput([list_files, counts_files],
679 | s,
680 | type=params['INPUT_DATA_TYPE'],
681 | id=params['INPUTS_IDS_DATASET'][0],
682 | repeat_set=images_repeat,
683 | max_video_len=params['NUM_FRAMES'],
684 | feat_len=params['IMG_FEAT_SIZE'],
685 | overwrite_split=True,
686 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])
687 |
688 | # if text
689 | if s in vidtext_set_names['text']:
690 | # Overwrite outputs assigning the new outputs repeat pattern
691 | ds.setOutput(final_outputs,
692 | s,
693 | type='text',
694 | id=params['OUTPUTS_IDS_DATASET'][0],
695 | build_vocabulary=True,
696 | tokenization=params['TOKENIZATION_METHOD'],
697 | fill=params['FILL'],
698 | pad_on_batch=True,
699 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
700 | sample_weights=params['SAMPLE_WEIGHTS'],
701 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
702 | overwrite_split=True)
703 |
704 | # Overwrite the input state_below assigning the new outputs repeat pattern
705 | ds.setInput(final_outputs,
706 | s,
707 | type='text',
708 | id=params['INPUTS_IDS_DATASET'][1],
709 | required=False,
710 | tokenization=params['TOKENIZATION_METHOD'],
711 | pad_on_batch=True,
712 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
713 | offset=1,
714 | fill=params['FILL'],
715 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
716 | max_words=params['OUTPUT_VOCABULARY_SIZE'],
717 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
718 | overwrite_split=True)
719 |
720 | if s in vidtext_set_names['video']:
721 | for feat_type in params['FEATURE_NAMES']:
722 | ds.setInput(final_inputs_vid[feat_type],
723 | s,
724 | type=params['INPUT_DATA_TYPE'],
725 | id=params['INPUTS_IDS_DATASET'][3],
726 | repeat_set=images_repeat,
727 | max_video_len=params['NUM_FRAMES'],
728 | feat_len=params['IMG_FEAT_SIZE'],
729 | overwrite_split=True,
730 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])
731 |
732 | if s in vidtext_set_names['text']:
733 | # Set new input captions from previous temporally-linked event/video
734 | ds.setInput(final_inputs_txt,
735 | s,
736 | type='text',
737 | id=params['INPUTS_IDS_DATASET'][2],
738 | required=False,
739 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
740 | tokenization=params['TOKENIZATION_METHOD'],
741 | fill=params['FILL'],
742 | pad_on_batch=True,
743 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
744 | min_occ=params['MIN_OCCURRENCES_VOCAB'],
745 | overwrite_split=True)
746 |
747 | repeat_images[s] = images_repeat
748 |
749 | return ds, repeat_images
750 |
751 |
752 | def insertVidTextEmbedNegativeSamples(ds, params, repeat):
753 | """
754 |     Inserts balanced negative examples for training a Video-Text Embedding model.
755 |
756 | :param ds: dataset object with inputs of positive samples inserted
757 | :param params: config params
758 | :param repeat: number of times each video was repeated
759 | """
760 |
761 | for s, r in zip(['train', 'val', 'test'], repeat):
762 |
763 | # Get data from dataset
764 | X = None
765 | num_samples = 0
766 | exec ('num_samples = ds.len_' + s)
767 | exec ('X = ds.X_' + s)
768 |
769 | video_indices = X[params['INPUTS_IDS_DATASET'][0]]
770 | descriptions = X[params['INPUTS_IDS_DATASET'][1]]
771 |
772 | # Get real indices considering repetitions
773 | desc_real_indices = np.repeat(range(len(r)), r)
774 |
775 | # Let's generate some random video-description pairs
776 | negative_videos = np.random.choice(video_indices, num_samples, replace=True)
777 | for neg_id in negative_videos:
778 | # Insert index of repeated video (now as negative sample)
779 | video_indices.append(neg_id)
780 |
781 |         # Now find a random description (avoiding the correct descriptions of the selected video)
782 | real_id = desc_real_indices[neg_id]
783 | desc_id = np.random.choice([ind for ind in range(num_samples) if desc_real_indices[ind] != real_id], 1)[0]
784 |
785 | # Insert description of negative sample
786 | descriptions.append(descriptions[desc_id])
787 |
788 | # Re-insert videos and descriptions, including new length
789 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][0] + '"] = video_indices')
790 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][1] + '"] = descriptions')
791 | exec ('ds.len_' + s + ' = num_samples*2')
792 |
793 |         # Insert output, which consists of 'matching'/'non-matching' labels
794 | matches = [1 for i in range(num_samples)] + [0 for i in range(num_samples)]
795 | ds.setOutput(matches,
796 | s,
797 | type='categorical',
798 | id=params['OUTPUTS_IDS_DATASET'][0])
799 |
800 | ds.setClasses(['matching', 'non-matching'], id=params['OUTPUTS_IDS_DATASET'][0])
801 |
--------------------------------------------------------------------------------
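A minimal sketch, independent of keras_wrapper, of the repeat patterns computed in build_dataset and insertTemporallyLinkedCaptions above; all counts and links below are invented:

import numpy as np

num_cap = np.array([2, 3, 1])   # captions per video (as in DESCRIPTION_COUNTS_FILES)
links = [-1, 0, 1]              # temporal links (as in LINK_SAMPLE_FILES); -1 = first of its day

# Plain captioning: each video is repeated once per caption (repeat_set=num_captions).
repeat_plain = list(num_cap)    # [2, 3, 1]

# Temporally-linked captioning: each caption of video i is paired with every caption of
# its linked previous video, so video i is repeated num_cap[i] * num_cap[link] times
# (just num_cap[i] times when link == -1, where the previous caption is empty).
repeat_linked = [int(num_cap[i] * (num_cap[l] if l != -1 else 1)) for i, l in enumerate(links)]
# repeat_linked == [2, 6, 3]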
/data_engine/split_data.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import numpy as np
5 | import xlrd
6 |
7 | # Split the existing data into train, val and test
8 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc'
9 | split_prop = {'train': 0.7,
10 | 'val': 0.15,
11 | 'test': 0.15,
12 | }
13 | sets_names = ['Estefania1', 'Estefania2', 'Estefania3', 'Estefania4', 'Estefania5',
14 | 'Gabriel1', 'Gabriel2', 'Gabriel3', 'Gabriel4',
15 | 'MAngeles1', 'MAngeles2', 'MAngeles3', 'MAngeles4',
16 | 'Marc1', 'Marc2', 'Marc3', 'Marc4', 'Marc5', 'Marc6', 'Marc7', 'Marc8', 'Marc9',
17 | 'Marc10', 'Marc11', 'Marc12', 'Marc13', 'Marc14', 'Marc15', 'Marc16', 'Marc17', 'Marc18',
18 | 'MarcC1',
19 | 'Mariella', 'Mariella2', 'Mariella3',
20 | 'Maya1', 'Maya2', 'Maya3', 'Maya4', 'Maya5', 'Maya6', 'Maya7', 'Maya8',
21 | 'Maya9', 'Maya10', 'Maya11', 'Maya12', 'Maya13', 'Maya14',
22 | 'Pedro1', 'Pedro2', 'Pedro3', 'Pedro4',
23 | # 'Txell1'
24 | 'Petia1', 'Petia2',
25 | ]
26 |
27 | sets = {'train': ['Maya14', 'Maya11', 'Maya10', 'Maya13', 'Maya12', 'Petia2',
28 | 'MAngeles4', 'Mariella', 'MAngeles1', 'Pedro1', 'MAngeles3',
29 | 'Pedro3', 'MarcC1', 'Estefania1', 'Estefania3', 'Marc18', 'Maya5',
30 | 'Gabriel3', 'Maya6', 'Maya1', 'Maya3', 'Marc16', 'Marc17',
31 | 'Marc15', 'Maya9', 'Maya8', 'Marc10', 'Marc11', 'Gabriel2',
32 | 'Marc7', 'Maya4', 'MAngeles2', 'Gabriel1', 'Marc8', 'Marc12',
33 | 'Marc5', 'Mariella3', 'Marc2', 'Marc3'],
34 | 'val': ['Pedro4', 'Pedro2', 'Estefania4', 'Maya7', 'Marc6', 'Petia1', 'Mariella2'],
35 | 'test': ['Estefania2', 'Marc1', 'Estefania5', 'Marc9', 'Gabriel4', 'Maya2', 'Marc4', 'Marc14', 'Marc13'],
36 | }
37 |
38 | # input data paths
39 | in_features_path = 'Features/Features_original'  # <set>/<in_features_name>.csv
40 | in_descriptions_path = 'GT/descriptions'  # <set>.txt
41 | in_segments_path = 'GT/segmentations'  # GT_<set>.xls(x)
42 | in_images_path = 'Images'  # <set>/<image>.jpg
43 | in_features_name = 'GoogleNet_ImageNet'
44 | format = '.jpg'
45 | # list of non-informative images stored in Features/NonInfo/<noninformative_prefix>_<set>.csv
46 | # leave empty for not using it
47 | in_noninfo_path = 'Features/NonInfo'
48 | noninformative_prefix = 'infoCNN_outputClasses'
49 |
50 | # output data paths
51 | out_features_path = 'Features'  # <split>_<out_features_name>_all_frames.csv & <split>_<out_features_name>_all_frames_counts.txt
52 | out_descriptions_path = 'Annotations'  # captions_final<suffix>.id.en & <split>_list_final<suffix>.txt
53 | out_features_name = 'ImageNet_Without_NonInfo'
54 | separator = '----'
55 |
56 | ####################################
57 |
58 | if noninformative_prefix:
59 | suffix_name = '_without_noninfo'
60 | else:
61 | suffix_name = ''
62 |
63 | # Only apply random selection if the sets split is not already provided
64 | if not sets:
65 | # generate data splits
66 | available_sets = len(sets_names)
67 | randomized = np.random.choice(sets_names, available_sets, replace=False)
68 |
69 | # randomized = np.array(sets_names)
70 |
71 | sets = dict()
72 | picked_so_far = 0
73 | for s, p in split_prop.iteritems():
74 | last_picked = np.ceil(picked_so_far + available_sets * p)
75 | sets[s] = randomized[picked_so_far:last_picked]
76 | picked_so_far = last_picked
77 |
78 | # read images
79 | images = dict()
80 | for n, s in sets.iteritems():
81 | for set in s:
82 | images[set] = []
83 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + set + '/*' + format)
84 | for im in these_images:
85 | images[set].append(im.split('/')[-1].split('.')[0])
86 | images[set] = sorted(images[set])
87 |
88 | # read segmentations
89 | events = dict()
90 | for n, s in sets.iteritems():
91 | for set in s:
92 | possible_names = ['/GT_' + set + '.xls', '/GT_' + set + '.xlsx', '/' + set + '.xls', '/' + set + '.xlsx']
93 | exists = False
94 | i = 0
95 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]):
96 | i += 1
97 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i])
98 | sheet = file.sheet_by_index(0)
99 |
100 | these_events = []
101 | empty = False
102 | i = 2 # 1st row with info
103 | while not empty:
104 | try:
105 | evt = sheet.cell(i, 1).value.split()
106 | if len(evt) == 1:
107 | evt = sheet.cell(i, 1).value.split('-')
108 | if evt:
109 | these_events.append([evt[0].strip(), evt[1].strip()])
110 | else:
111 | empty = True
112 | i += 1
113 | except:
114 | empty = True
115 | events[set] = these_events
116 |
117 | # get frames counts from segments and images lists
118 | counts = dict()
119 | for n, s in sets.iteritems():
120 | counts[n] = []
121 | for set in s:
122 | counts[set] = []
123 | prev = -1
124 | for e in events[set]:
125 | if e[1] not in images[set]:
126 | e[1] = '0' + e[1]
127 | if e[0] not in images[set]:
128 | e[0] = '0' + e[0]
129 |
130 | if prev != -1 and images[set].index(e[0]) - images[set].index(prev) > 1:
131 | raise Exception(images[set].index(e[0]), images[set].index(prev))
132 | c = images[set].index(e[1]) - images[set].index(e[0]) + 1
133 | prev = e[1]
134 |
135 | counts[set].append(c)
136 | counts[n].append(c)
137 |
138 | assert np.sum(counts[set]) == len(images[set])
139 |
140 | # get erroneous segments
141 | to_remove = dict()
142 | for n, s in sets.iteritems():
143 | to_remove[n] = dict()
144 | for set in s:
145 | to_remove[n][set] = []
146 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file:
147 | prev_segm = -1
148 | count = 0
149 | segm_count = 0
150 | segm_count_show = 0
151 | for cline, line in enumerate(desc_file):
152 | if line:
153 | line = line.rstrip('\n').split(',')
154 | segm = line[0]
155 | desc = ','.join(line[1:])
156 | desc = desc.strip().lower()
157 | if desc == 'error':
158 | to_remove[n][set].append(segm_count)
159 | else:
160 | if prev_segm != segm:
161 | segm_count_show += 1
162 | count = 0
163 | count += 1
164 | assert segm[:7] == 'Segment', set + ', line ' + str(cline)
165 | if prev_segm != segm:
166 | if prev_segm == -1:
167 | assert int(segm[7:]) == 1
168 | else:
169 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str(
170 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1)
171 | segm_count += 1
172 | prev_segm = segm
173 |
174 | # get features for each data split
175 | print 'Building features files...'
176 | print '----------------------------------------'
177 | for n, s in sets.iteritems():
178 | extra_removed = 0
179 | written_in_file = 0
180 | all_total = 0
181 | all_error = 0
182 | feats_file = open(
183 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames' + suffix_name + '.csv',
184 | 'w')
185 | counts_file = open(
186 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames_counts' + suffix_name + '.txt',
187 | 'w')
188 | for set in s:
189 | these_removed = to_remove[n][set]
190 | these_counts = counts[set]
191 | feats_set = open(data_path + '/' + in_features_path + '/' + set + '/' + in_features_name + '.csv', 'r')
192 | if noninformative_prefix:
193 | noninfo_file = open(data_path + '/' + in_noninfo_path + '/' + noninformative_prefix + '_' + set + '.csv',
194 | 'r')
195 | for ic, count in enumerate(these_counts):
196 | all_total += 1
197 | new_count = 0
198 | these_feats = []
199 | for c in range(count):
200 | line = feats_set.next().rstrip('\n')
201 | is_informative = True
202 | if noninformative_prefix:
203 | noninfo_line = noninfo_file.next().rstrip('\n')
204 | # checks if the current frame is non-informative and discards it
205 | if float(noninfo_line.split(',')[0]) >= 0.5:
206 | is_informative = False
207 | if is_informative:
208 | these_feats.append(line)
209 | new_count += 1
210 | if ic in these_removed:
211 | all_error += 1
212 |             # Empty sequence due to non-informative removal. Let's introduce it into the to_remove list
213 | if noninformative_prefix and len(these_feats) == 0:
214 | if ic not in these_removed:
215 | extra_removed += 1
216 | to_remove[n][set].append(ic)
217 | these_removed.append(ic)
218 | if ic not in these_removed:
219 | written_in_file += 1
220 | for feat in these_feats:
221 | feats_file.write(feat + '\n')
222 | counts_file.write(str(new_count) + '\n')
223 |
224 | if noninformative_prefix:
225 | noninfo_file.close()
226 | feats_set.close()
227 | feats_file.close()
228 | counts_file.close()
229 |
230 | print 'Extra removed', n, ':', extra_removed
231 | print 'Written in file', n, ':', written_in_file
232 | print '"ERROR" events', n, ':', all_error
233 | print 'Total original events', n, ':', all_total
234 | print
235 |
236 | # get descriptions for each segment
237 | print 'Building captions files...'
238 | print '----------------------------------------'
239 | caption_general = open(data_path + '/' + out_descriptions_path + '/' + 'captions_final' + suffix_name + '.id.en', 'w')
240 | for n, s in sets.iteritems():
241 | written_in_file = 0
242 | all_total = 0
243 | all_error = 0
244 | split_file = open(data_path + '/' + out_descriptions_path + '/' + n + '_list_final' + suffix_name + '.txt', 'w')
245 | for set in s:
246 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file:
247 | prev_segm = -1
248 | count = 0
249 | segm_count = -1
250 | segm_count_show = 0
251 | for cline, line in enumerate(desc_file):
252 | if line:
253 | line = line.rstrip('\n').split(',')
254 | segm = line[0]
255 | desc = ','.join(line[1:])
256 | desc = desc.strip().lower()
257 | if prev_segm != segm:
258 | all_total += 1
259 | if prev_segm == -1:
260 | assert int(segm[7:]) == 1
261 | else:
262 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str(
263 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1)
264 | segm_count += 1
265 | if desc != 'error' and segm_count not in to_remove[n][set]:
266 | if prev_segm != segm:
267 | written_in_file += 1
268 | segm_count_show += 1
269 | split_file.write(set + '_Segment_' + str(segm_count_show) + '\n')
270 | count = 0
271 | caption_general.write(set + '_Segment_' + str(segm_count_show)
272 | + '#' + str(count) + separator + desc + '\n')
273 | count += 1
274 | else:
275 | if prev_segm != segm:
276 | all_error += 1
277 | assert segm[:7] == 'Segment', set + ', line ' + str(cline)
278 |
279 | prev_segm = segm
280 | try:
281 | int(segm[7:])
282 | except:
283 | raise Exception(set + ' wrong Segment identifier: ' + segm)
284 | assert segm_count + 1 == int(segm[7:]), set + ': ' + str(segm_count + 1) + ' != ' + segm[7:]
285 | assert len(counts[set]) == segm_count + 1, set + ': ' + str(segm_count + 1) + ' != ' + str(len(counts[set]))
286 |
287 | split_file.close()
288 |
289 | print 'Written in file', n, ':', written_in_file
290 | print 'All removed events', n, ':', all_error
291 | print 'Total original events', n, ':', all_total
292 | print
293 |
294 | caption_general.close()
295 |
296 | print 'DONE!'
297 |
--------------------------------------------------------------------------------
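A hedged sketch (with invented scores) of the non-informative filtering rule applied in split_data.py above: a frame is discarded when the first value of its line in the infoCNN_outputClasses_<set>.csv file is greater than or equal to 0.5:

noninfo_lines = ['0.12,0.88', '0.73,0.27', '0.49,0.51']   # one line per frame

kept = [i for i, line in enumerate(noninfo_lines)
        if float(line.split(',')[0]) < 0.5]
# kept == [0, 2]; the second frame is dropped as non-informative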
/data_engine/subsample_frames_features.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/'
4 | features_path = 'Features/Full_Features'
5 | output_path = 'Features'
6 |
7 | n_frames_per_video_subsample = 26 # subsample fixed number of equidistant frames per video
8 | repeat_frames = False  # decides whether we repeat some frames when needed to fill the desired
9 | # "n_frames_per_video_subsample", or we simply pad the video frames with 0s
10 |
11 |
12 | # Inputs
13 | features_name = 'C3D_fc8_ImageNet'
14 | features_files = ['train_' + features_name + '_features.csv', 'val_' + features_name + '_features.csv',
15 | 'test_' + features_name + '_features.csv']
16 | features_counts_files = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt',
17 | 'test_' + features_name + '_counts.txt']
18 |
19 | # Outputs
20 | out_features_name = 'C3D_fc8_ImageNet'
21 | out_features = ['train_' + out_features_name + '.csv', 'val_' + out_features_name + '.csv',
22 | 'test_' + out_features_name + '.csv']
23 | out_features_counts = ['train_' + out_features_name + '_counts.txt', 'val_' + out_features_name + '_counts.txt',
24 | 'test_' + out_features_name + '_counts.txt']
25 |
26 | #########
27 |
28 | for ff_, fc_, of_, oc_ in zip(features_files, features_counts_files, out_features, out_features_counts):
29 |
30 | print 'Processing file', base_path + '/' + features_path + '/' + ff_
31 |
32 | # Open files
33 | ff = open(base_path + '/' + features_path + '/' + ff_, 'r')
34 | fc = open(base_path + '/' + features_path + '/' + fc_, 'r')
35 | of = open(base_path + '/' + output_path + '/' + of_, 'w')
36 | oc = open(base_path + '/' + output_path + '/' + oc_, 'w')
37 |
38 | # Process each video
39 | for count_videos, count in enumerate(fc):
40 | # Calculate chosen frames
41 | count = int(count.strip('\n'))
42 | # pick_pos = np.round(np.linspace(0,count-1,n_frames_per_video_subsample)).astype('int64')
43 | pick_pos = np.linspace(0, count - 1, n_frames_per_video_subsample).astype('int64')
44 | if not repeat_frames:
45 | pick_pos = np.unique(pick_pos)
46 | count_pick = len(pick_pos)
47 |
48 | # Get all frames from current video
49 | feats = [[] for i in range(count)]
50 | for i in range(count):
51 | feats[i] = ff.next()
52 |
53 | # Get chosen frames
54 | for p in pick_pos:
55 | of.write(feats[p])
56 | oc.write(str(count_pick) + '\n')
57 | if count_pick != n_frames_per_video_subsample:
58 | print "different", count_videos
59 | print "num", count_pick
60 |
61 | ff.close()
62 | fc.close()
63 | of.close()
64 | oc.close()
65 |
66 | print 'Output stored in', base_path + '/' + output_path + '/' + of_
67 |
--------------------------------------------------------------------------------
/docs/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/docs/model.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import logging
3 | import sys
4 | from timeit import default_timer as timer
5 |
6 | from config import load_parameters
7 | from data_engine.prepare_data import build_dataset
8 | from keras_wrapper.cnn_model import loadModel, transferWeights, updateModel
9 | from keras_wrapper.extra.callbacks import EvalPerformance, Sample
10 | from keras_wrapper.extra.evaluation import selectMetric
11 | from keras_wrapper.extra.read_write import dict2pkl, list2file
12 | from keras_wrapper.utils import decode_predictions_beam_search, decode_predictions
13 | from viddesc_model import VideoDesc_Model
14 |
15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def train_model(params):
20 | """
21 |     Training function. Sets the training parameters from params. Builds or loads the model and launches the training.
22 | :param params: Dictionary of network hyperparameters.
23 | :return: None
24 | """
25 |
26 | if params['RELOAD'] > 0:
27 | logging.info('Resuming training.')
28 |
29 | check_params(params)
30 |
31 | ########### Load data
32 | dataset = build_dataset(params)
33 | if not '-vidtext-embed' in params['DATASET_NAME']:
34 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
35 | else:
36 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]]
37 | ###########
38 |
39 |
40 | ########### Build model
41 |
42 | if params['MODE'] == 'finetuning':
43 | # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD'])
44 | video_model = VideoDesc_Model(params,
45 | type=params['MODEL_TYPE'],
46 | verbose=params['VERBOSE'],
47 | model_name=params['MODEL_NAME'] + '_reloaded',
48 | vocabularies=dataset.vocabulary,
49 | store_path=params['STORE_PATH'],
50 | set_optimizer=False,
51 | clear_dirs=False)
52 | video_model = updateModel(video_model, params['RELOAD_PATH'], params['RELOAD'], reload_epoch=False)
53 | video_model.setParams(params)
54 |
55 | # Define the inputs and outputs mapping from our Dataset instance to our model
56 | inputMapping = dict()
57 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
58 | if len(video_model.ids_inputs) > i:
59 | pos_source = dataset.ids_inputs.index(id_in)
60 | id_dest = video_model.ids_inputs[i]
61 | inputMapping[id_dest] = pos_source
62 | video_model.setInputsMapping(inputMapping)
63 |
64 | outputMapping = dict()
65 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
66 | if len(video_model.ids_outputs) > i:
67 | pos_target = dataset.ids_outputs.index(id_out)
68 | id_dest = video_model.ids_outputs[i]
69 | outputMapping[id_dest] = pos_target
70 | video_model.setOutputsMapping(outputMapping)
71 |
72 | video_model.setOptimizer()
73 | params['MAX_EPOCH'] += params['RELOAD']
74 |
75 | else:
76 | if params['RELOAD'] == 0 or params['LOAD_WEIGHTS_ONLY']: # build new model
77 | video_model = VideoDesc_Model(params,
78 | type=params['MODEL_TYPE'],
79 | verbose=params['VERBOSE'],
80 | model_name=params['MODEL_NAME'],
81 | vocabularies=dataset.vocabulary,
82 | store_path=params['STORE_PATH'],
83 | set_optimizer=True)
84 | dict2pkl(params, params['STORE_PATH'] + '/config')
85 |
86 | # Define the inputs and outputs mapping from our Dataset instance to our model
87 | inputMapping = dict()
88 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
89 | if len(video_model.ids_inputs) > i:
90 | pos_source = dataset.ids_inputs.index(id_in)
91 | id_dest = video_model.ids_inputs[i]
92 | inputMapping[id_dest] = pos_source
93 | video_model.setInputsMapping(inputMapping)
94 |
95 | outputMapping = dict()
96 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
97 | if len(video_model.ids_outputs) > i:
98 | pos_target = dataset.ids_outputs.index(id_out)
99 | id_dest = video_model.ids_outputs[i]
100 | outputMapping[id_dest] = pos_target
101 | video_model.setOutputsMapping(outputMapping)
102 |
103 | # Only load weights from pre-trained model
104 | if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0:
105 | for i in range(0, len(params['RELOAD'])):
106 | old_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'][i], params['RELOAD'][i])
107 | video_model = transferWeights(old_model, video_model, params['LAYERS_MAPPING'][i])
108 | video_model.setOptimizer()
109 | params['RELOAD'] = 0
110 | else: # resume from previously trained model
111 | video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD'])
112 | video_model.params['LR'] = params['LR']
113 | video_model.setOptimizer()
114 |
115 | if video_model.model_path != params['STORE_PATH']:
116 | video_model.setName(params['MODEL_NAME'], models_path=params['STORE_PATH'], clear_dirs=False)
117 |             # Update the optimizer whether we are loading or building the model
118 | video_model.params = params
119 | video_model.setOptimizer()
120 | ###########
121 |
122 |
123 | ########### Test model saving/loading functions
124 | # saveModel(video_model, params['RELOAD'])
125 | # video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
126 | ###########
127 |
128 |
129 | ########### Callbacks
130 | callbacks = buildCallbacks(params, video_model, dataset)
131 | ###########
132 |
133 |
134 | ########### Training
135 | total_start_time = timer()
136 |
137 | logger.debug('Starting training!')
138 | training_params = {'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'],
139 | 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
140 | 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'],
141 | 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'],
142 | 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'],
143 | 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'],
144 | 'data_augmentation': params['DATA_AUGMENTATION'],
145 | 'patience': params.get('PATIENCE', 0), # early stopping parameters
146 | 'metric_check': params.get('STOP_METRIC', None),
147 | 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
148 | 'each_n_epochs': params.get('EVAL_EACH', 1),
149 | 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0)
150 | }
151 |
152 | video_model.trainNet(dataset, training_params)
153 |
154 | total_end_time = timer()
155 | time_difference = total_end_time - total_start_time
156 |     logging.info('Total training time: {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
157 |
158 |
159 | def apply_Video_model(params):
160 | """
161 | Function for using a previously trained model for sampling.
162 | """
163 |
164 | ########### Load data
165 | dataset = build_dataset(params)
166 | if not '-vidtext-embed' in params['DATASET_NAME']:
167 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
168 | else:
169 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]]
170 | ###########
171 |
172 |
173 | ########### Load model
174 | video_model = loadModel(params['STORE_PATH'], params['SAMPLING_RELOAD_POINT'],
175 | reload_epoch=params['SAMPLING_RELOAD_EPOCH'])
176 | video_model.setOptimizer()
177 | ###########
178 |
179 |
180 | ########### Apply sampling
181 | extra_vars = dict()
182 | extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])
183 | extra_vars['language'] = params.get('TRG_LAN', 'en')
184 |
185 | for s in params["EVAL_ON_SETS"]:
186 |
187 | # Apply model predictions
188 | params_prediction = {'max_batch_size': params['BATCH_SIZE'],
189 | 'n_parallel_loaders': params['PARALLEL_LOADERS'],
190 | 'predict_on_sets': [s]}
191 |
192 | # Convert predictions into sentences
193 | if not '-vidtext-embed' in params['DATASET_NAME']:
194 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
195 | else:
196 | vocab = None
197 |
198 | if params['BEAM_SEARCH']:
199 | params_prediction['beam_size'] = params['BEAM_SIZE']
200 | params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
201 | params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params[
202 | 'DATASET_NAME']
203 | params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
204 | params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
205 | params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
206 | params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
207 | params_prediction['normalize_probs'] = params['NORMALIZE_SAMPLING']
208 | params_prediction['alpha_factor'] = params['ALPHA_FACTOR']
209 | params_prediction['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in \
210 | params[
211 | 'DATASET_NAME'] and '-video' not in \
212 | params[
213 | 'DATASET_NAME']
214 | predictions = video_model.predictBeamSearchNet(dataset, params_prediction)[s]
215 | predictions = decode_predictions_beam_search(predictions, vocab, verbose=params['VERBOSE'])
216 | else:
217 | predictions = video_model.predictNet(dataset, params_prediction)[s]
218 | predictions = decode_predictions(predictions, 1, vocab, params['SAMPLING'], verbose=params['VERBOSE'])
219 |
220 | # Store result
221 | filepath = video_model.model_path + '/' + s + '_sampling.pred' # results file
222 | if params['SAMPLING_SAVE_MODE'] == 'list':
223 | list2file(filepath, predictions)
224 | else:
225 |             raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
226 |
227 | # Evaluate if any metric in params['METRICS']
228 | for metric in params['METRICS']:
229 | logging.info('Evaluating on metric ' + metric)
230 | filepath = video_model.model_path + '/' + s + '_sampling.' + metric # results file
231 |
232 | # Evaluate on the chosen metric
233 | extra_vars[s] = dict()
234 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
235 | metrics = selectMetric[metric](
236 | pred_list=predictions,
237 | verbose=1,
238 | extra_vars=extra_vars,
239 | split=s)
240 |
241 | # Print results to file
242 | with open(filepath, 'w') as f:
243 | header = ''
244 | line = ''
245 | for metric_ in sorted(metrics):
246 | value = metrics[metric_]
247 | header += metric_ + ','
248 | line += str(value) + ','
249 | f.write(header + '\n')
250 | f.write(line + '\n')
251 | logging.info('Done evaluating on metric ' + metric)
252 |
253 |
254 | def buildCallbacks(params, model, dataset):
255 | """
256 | Builds the selected set of callbacks run during the training of the model.
257 |
258 | :param params: Dictionary of network hyperparameters.
259 | :param model: Model instance on which to apply the callback.
260 | :param dataset: Dataset instance on which to apply the callback.
261 |     :return: List of callbacks used during training.
262 | """
263 |
264 | callbacks = []
265 |
266 | if params['METRICS']:
267 | # Evaluate training
268 | extra_vars = {'language': params.get('TRG_LAN', 'en'),
269 | 'n_parallel_loaders': params['PARALLEL_LOADERS'],
270 | 'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD'])}
271 |
272 | if not '-vidtext-embed' in params['DATASET_NAME']:
273 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
274 | for s in params['EVAL_ON_SETS']:
275 | extra_vars[s] = dict()
276 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
277 | else:
278 | vocab = None
279 | extra_vars['n_classes'] = len(dataset.dic_classes[params['OUTPUTS_IDS_DATASET'][0]].values())
280 | for s in params['EVAL_ON_SETS']:
281 | extra_vars[s] = dict()
282 | extra_vars[s]['references'] = eval('dataset.Y_' + s + '["' + params['OUTPUTS_IDS_DATASET'][0] + '"]')
283 |
284 | if params['BEAM_SEARCH']:
285 | extra_vars['beam_size'] = params.get('BEAM_SIZE', 6)
286 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1)
287 | extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30)
288 | extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True) and '-upperbound' not in params[
289 | 'DATASET_NAME']
290 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL']
291 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL']
292 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET']
293 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
294 | extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
295 | extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.)
296 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[
297 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME']
298 | input_text_id = None
299 | vocab_src = None
300 |
301 | callback_metric = EvalPerformance(model,
302 | dataset,
303 | gt_id=params['OUTPUTS_IDS_DATASET'][0],
304 | metric_name=params['METRICS'],
305 | set_name=params['EVAL_ON_SETS'],
306 | batch_size=params['BATCH_SIZE'],
307 | each_n_epochs=params['EVAL_EACH'],
308 | extra_vars=extra_vars,
309 | reload_epoch=params['RELOAD'],
310 | is_text=True,
311 | input_text_id=input_text_id,
312 | index2word_y=vocab,
313 | index2word_x=vocab_src,
314 | sampling_type=params['SAMPLING'],
315 | beam_search=params['BEAM_SEARCH'],
316 | save_path=model.model_path,
317 | start_eval_on_epoch=params['START_EVAL_ON_EPOCH'],
318 | write_samples=True,
319 | write_type=params['SAMPLING_SAVE_MODE'],
320 | eval_on_epochs=params['EVAL_EACH_EPOCHS'],
321 | save_each_evaluation=params['SAVE_EACH_EVALUATION'],
322 | verbose=params['VERBOSE'])
323 | else:
324 | callback_metric = EvalPerformance(model,
325 | dataset,
326 | gt_id=params['OUTPUTS_IDS_DATASET'][0],
327 | metric_name=params['METRICS'],
328 | set_name=params['EVAL_ON_SETS'],
329 | batch_size=params['BATCH_SIZE'],
330 | each_n_epochs=params['EVAL_EACH'],
331 | extra_vars=extra_vars,
332 | reload_epoch=params['RELOAD'],
333 | save_path=model.model_path,
334 | start_eval_on_epoch=params[
335 | 'START_EVAL_ON_EPOCH'],
336 | write_samples=True,
337 | write_type=params['SAMPLING_SAVE_MODE'],
338 | eval_on_epochs=params['EVAL_EACH_EPOCHS'],
339 | save_each_evaluation=params[
340 | 'SAVE_EACH_EVALUATION'],
341 | verbose=params['VERBOSE'])
342 |
343 | callbacks.append(callback_metric)
344 |
345 | if params['SAMPLE_ON_SETS']:
346 | # Write some samples
347 | extra_vars = {'language': params.get('TRG_LAN', 'en'), 'n_parallel_loaders': params['PARALLEL_LOADERS']}
348 | if not '-vidtext-embed' in params['DATASET_NAME']:
349 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
350 | else:
351 | vocab = None
352 | if params['BEAM_SEARCH']:
353 | extra_vars['beam_size'] = params['BEAM_SIZE']
354 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1)
355 | extra_vars['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
356 | extra_vars['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params['DATASET_NAME']
357 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL']
358 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL']
359 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET']
360 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
361 | extra_vars['normalize_probs'] = params['NORMALIZE_SAMPLING']
362 | extra_vars['alpha_factor'] = params['ALPHA_FACTOR']
363 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[
364 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME']
365 |
366 | callback_sampling = Sample(model,
367 | dataset,
368 | gt_id=params['OUTPUTS_IDS_DATASET'][0],
369 | set_name=params['SAMPLE_ON_SETS'],
370 | n_samples=params['N_SAMPLES'],
371 | each_n_updates=params['SAMPLE_EACH_UPDATES'],
372 | extra_vars=extra_vars,
373 | reload_epoch=params['RELOAD'],
374 | batch_size=params['BATCH_SIZE'],
375 | is_text=True,
376 | index2word_y=vocab, # text info
377 | in_pred_idx=params['INPUTS_IDS_DATASET'][0],
378 | sampling_type=params['SAMPLING'], # text info
379 | beam_search=params['BEAM_SEARCH'],
380 | start_sampling_on_epoch=params['START_SAMPLING_ON_EPOCH'],
381 | verbose=params['VERBOSE'])
382 | callbacks.append(callback_sampling)
383 |
384 | return callbacks
385 |
386 |
387 | def check_params(params):
388 | if 'Glove' in params['MODEL_TYPE'] and params['GLOVE_VECTORS'] is None:
389 |         logger.warning("You set a model that uses pretrained word vectors but you didn't specify a vector file. "
390 |                        "We'll train WITHOUT pretrained embeddings!")
391 | if params["USE_DROPOUT"] and params["USE_BATCH_NORMALIZATION"]:
392 | logger.warning("It's not recommended to use both dropout and batch normalization")
393 |
394 |
395 | if __name__ == "__main__":
396 |
397 | parameters = load_parameters()
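    # Any parameter defined in config.py can be overridden from the command line as key=value
    # pairs, where the value must be a Python literal, e.g. (illustrative values):
    #   python main.py BATCH_SIZE=32 MODE="'sampling'"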
398 | try:
399 | for arg in sys.argv[1:]:
400 | k, v = arg.split('=')
401 | parameters[k] = ast.literal_eval(v)
402 | except ValueError:
403 | print 'Overwritten arguments must have the form key=Value'
404 | exit(1)
405 | check_params(parameters)
406 | if parameters['MODE'] == 'training' or parameters['MODE'] == 'finetuning':
407 | logging.info('Running training.')
408 | train_model(parameters)
409 | elif parameters['MODE'] == 'sampling':
410 | logging.info('Running sampling.')
411 | apply_Video_model(parameters)
412 |
413 | logging.info('Done!')
414 |
--------------------------------------------------------------------------------
/meta-optimizers/spearmint/README.md:
--------------------------------------------------------------------------------
1 | Package for performing hyperparameter optimization with [Spearmint](https://github.com/HIPS/Spearmint).
2 |
3 | Requirements: Those specified in the [Spearmint](https://github.com/HIPS/Spearmint) package:
4 |
5 | * [NumPy](http://www.numpy.org/)
6 | * [scikit learn](http://scikit-learn.org/stable/index.html)
7 | * [pymongo](https://api.mongodb.org/python/current)
8 | * [MongoDB](https://www.mongodb.org)
9 |
10 | Installation:
11 |
12 | * Install [Spearmint](https://github.com/HIPS/Spearmint/blob/master/README.md)
13 |
14 | Usage:
15 |
16 | 1) Set your experimental settings (see `${nmt_keras_path}/spearmint/config.json` for an example)
17 |
18 | * **_WARNING!_**: It is highly recommended to specify an absolute path to the data files in `config.py` when launching Spearmint (see the illustrative snippet below)!
19 |
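A minimal illustration of the idea (the parameter name below is a placeholder; use whatever keys your `config.py` actually defines):

```python
# config.py (sketch): prefer absolute paths so that Spearmint-launched jobs resolve them correctly
DATA_ROOT_PATH = '/absolute/path/to/datasets/MSVD/'   # placeholder parameter name and path
```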
20 | 2) Run the `launch_spearmint.sh` script. It will execute the following steps:
21 |
22 | * Get NMT-Keras directory:
23 |
24 | ```bash
25 | cd nmt-keras
26 | nmt_keras_path=`pwd`
27 | ```
28 |
29 | * Create directory for storing the database:
30 |
31 | ```bash
32 | mkdir ${nmt_keras_path}/spearmint/db
33 | ```
34 |
35 | * Start the Mongo database:
36 |
37 | ```bash
38 | mongod --fork --logpath ${nmt_keras_path}/spearmint/db/log --dbpath ${nmt_keras_path}/spearmint/db
39 | ```
40 |
41 | * Remove any leftover instances from previous experiments:
42 |
43 | ```bash
44 | ${spearmint_path}/spearmint/cleanup.sh ${nmt_keras_path}/spearmint/
45 | ```
46 |
47 | * Launch Spearmint! Assuming that it is installed under `${spearmint_path}`:
48 |
49 | ```bash
50 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err &
51 | ```
52 |
53 | * The results will appear at `${nmt_keras_path}/spearmint/output`
54 |
55 |
--------------------------------------------------------------------------------
/meta-optimizers/spearmint/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/meta-optimizers/spearmint/__init__.py
--------------------------------------------------------------------------------
/meta-optimizers/spearmint/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "language": "PYTHON",
3 | "main-file": "spearmint_opt.py",
4 | "experiment-name": "TemporallyLinkedVideoDescriptionAtt",
5 | "likelihood": "GAUSSIAN",
6 | "variables": {
7 | "TARGET_TEXT_EMBEDDING_SIZE": {
8 | "type": "INT",
9 | "size": 1,
10 | "min": 50,
11 | "max": 600
12 | },
13 | "ENCODER_HIDDEN_SIZE": {
14 | "type": "INT",
15 | "size": 1,
16 | "min": 100,
17 | "max": 500
18 | },
19 | "LR_GAMMA": {
20 | "type": "FLOAT",
21 | "size": 1,
22 | "min": 0.95,
23 | "max": 1.0
24 | },
25 | "N_LAYERS_ENCODER": {
26 | "type": "INT",
27 | "size": 1,
28 | "min": 1,
29 | "max": 2
30 | },
31 | "N_LAYERS_PREV_SENT_ENCODER": {
32 | "type": "INT",
33 | "size": 1,
34 | "min": 1,
35 | "max": 2
36 | },
37 | "DECODER_HIDDEN_SIZE": {
38 | "type": "INT",
39 | "size": 1,
40 | "min": 100,
41 | "max": 600
42 | },
43 | "PREV_SENT_ENCODER_HIDDEN_SIZE": {
44 | "type": "INT",
45 | "size": 1,
46 | "min": 100,
47 | "max": 500
48 | }
49 | }
50 | }
51 |
52 |
--------------------------------------------------------------------------------
/meta-optimizers/spearmint/launch_spearmint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | spearmint_path=${SOFTWARE_PREFIX}/Spearmint
4 | nmt_keras_path=${SOFTWARE_PREFIX}/egocentric-video-description
5 | dest_dir=${nmt_keras_path}/meta-optimizers/spearmint
6 | mkdir -p ${dest_dir}/db
7 | mkdir -p ${dest_dir}/logs
8 |
9 | # Launch MongoDB if it is not already running
10 | if [ `ps -wuax |grep mongod |wc -l` -lt 2 ]; then
11 | mongod --fork --logpath ${dest_dir}/db/log --dbpath ${dest_dir}/db;
12 | fi
13 |
14 |
15 | ${spearmint_path}/spearmint/cleanup.sh ${dest_dir}
16 |
17 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err &
18 | echo "Main Spearmint process PID:" $! >> ${dest_dir}/logs/out.log
--------------------------------------------------------------------------------
/meta-optimizers/spearmint/spearmint_opt.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import subprocess
4 | import sys
5 |
6 | # sys.path.append("../../") # Adds higher directory to python modules path.
7 | sys.path.insert(1, os.path.abspath("."))
8 | sys.path.insert(0, os.path.abspath("../../"))
9 |
10 | print sys.path
11 |
12 | from config import load_parameters
13 | from main import check_params, train_model
14 |
15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
16 | logger = logging.getLogger(__name__)
17 | metric_name = 'Bleu_4'
18 | maximize = True # Select whether we want to maximize the metric or minimize it
19 | d = dict(os.environ.copy())
20 | d['LC_NUMERIC'] = 'en_US.utf-8'
21 |
22 |
23 | def invoke_model(parameters):
24 | model_params = load_parameters()
25 | model_name = model_params["MODEL_TYPE"]
26 | for parameter in parameters.keys():
27 | model_params[parameter] = parameters[parameter][0]
28 | logger.debug("Assigning to %s the value %s" % (str(parameter), parameters[parameter][0]))
29 | model_name += '_' + str(parameter) + '_' + str(parameters[parameter][0])
30 | model_params["SKIP_VECTORS_HIDDEN_SIZE"] = model_params["TARGET_TEXT_EMBEDDING_SIZE"]
31 | model_params["MODEL_NAME"] = model_name
32 | # models and evaluation results will be stored here
33 | model_params[
34 | "STORE_PATH"] = '/home/lvapeab/smt/software/egocentric-video-description/meta-optimizers/spearmint/trained_models/' + \
35 | model_params["MODEL_NAME"] + '/'
36 | check_params(model_params)
37 | assert model_params['MODE'] == 'training', 'You can only launch Spearmint when training!'
38 | logging.info('Running training.')
39 | train_model(model_params)
40 |
41 | results_path = model_params['STORE_PATH'] + '/' + model_params['EVAL_ON_SETS'][0] + '.' + model_params['METRICS'][0]
42 |
43 |     # Recover the highest metric score: find the metric_name column in the results CSV header, then take the maximum of that column across all evaluations
44 | metric_pos_cmd = "head -n 1 " + results_path + \
45 | " |awk -v metric=" + metric_name + \
46 | " 'BEGIN{FS=\",\"}" \
47 | "{for (i=1; i<=NF; i++) if ($i == metric) print i;}'"
48 | metric_pos = \
49 | subprocess.Popen(metric_pos_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate()[0][:-1]
50 | cmd = "tail -n +2 " + results_path + \
51 | " |awk -v m_pos=" + str(metric_pos) + \
52 | " 'BEGIN{FS=\",\"}{print $m_pos}'|sort -gr|head -n 1"
53 | ps = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, env=d)
54 | metric_value = float(ps.communicate()[0])
55 | print "Best %s: %f" % (metric_name, metric_value)
56 |
57 | return 1. - metric_value if maximize else metric_value # Spearmint minimizes a function
58 |
59 |
60 | def main(job_id, params):
61 | print params
62 | return invoke_model(params)
63 |
64 |
65 | if __name__ == "__main__":
66 | # Testing function
67 | params = {'SOURCE_TEXT_EMBEDDING_SIZE': [1],
68 | 'ENCODER_HIDDEN_SIZE': [2],
69 | 'TARGET_TEXT_EMBEDDING_SIZE': [1],
70 | 'DECODER_HIDDEN_SIZE': [2],
71 | 'MAX_EPOCH': [2],
72 | 'START_EVAL_ON_EPOCH': [1]}
73 | main(1, params)
74 |
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | PYTHONPATH=$PYTHONPATH:/media/HDD_2TB/marc/multimodal_keras_wrapper python -u main.py
2 |
--------------------------------------------------------------------------------
/turing_test.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import sys
4 |
5 | import numpy as np
6 |
7 | from config import load_parameters
8 | from data_engine.prepare_data import build_dataset
9 | from viddesc_model import VideoDesc_Model
10 |
11 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def build(params):
16 | ds = build_dataset(params)
17 | params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
18 | vocab = ds.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
19 |
20 | # We only want the model for decoding
21 | video_model = VideoDesc_Model(params,
22 | type=params['MODEL_TYPE'],
23 | verbose=0,
24 | model_name=params['MODEL_NAME'],
25 | vocabularies=ds.vocabulary,
26 | store_path=params['STORE_PATH'],
27 | set_optimizer=False)
28 |
29 | return ds, vocab, video_model
30 |
31 |
32 | def sample(ds, vocab, video_model, n_samples, split='train', verbose=1):
33 | truth_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples)
34 |
35 | matches = 0
36 | misses = 0
37 | guesses = 0
38 |
39 |     [truth_X, truth_Y] = ds.getXY_FromIndices(split, truth_data)
40 |
41 | truth_Xs = video_model.decode_predictions_beam_search(np.asarray(truth_X[-2]), vocab, verbose=0, pad_sequences=True)
42 | truth_Ys = video_model.decode_predictions_one_hot(np.asarray(truth_Y[0][0]), vocab)
43 |
44 | for i, (truth_X, truth_Y) in enumerate(zip(truth_Xs, truth_Ys)):
45 | try:
46 | fake_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples)
47 |             [fake_X, fake_Y] = ds.getXY_FromIndices(split, fake_data)
48 | fake_Xs = video_model.decode_predictions_beam_search(np.asarray(fake_X[-2]), vocab, verbose=0,
49 | pad_sequences=True)
50 | fake_Ys = video_model.decode_predictions_one_hot(np.asarray(fake_Y[0][0]), vocab)
51 |
52 | print "Input", i, ":", truth_X
53 | print "Which is the following event?"
54 |
55 | answer_list = [truth_Y] + fake_Ys
56 | correctness_list = [True] + [False] * len(fake_Ys)
57 | answer_correctness_list = list(zip(answer_list, correctness_list))
58 | random.shuffle(answer_correctness_list)
59 | shuffled_answer_list, shuffled_correctness_list = zip(*answer_correctness_list)
60 | for j, answer in enumerate(shuffled_answer_list):
61 | print "\t", j, ":", answer
62 | action = int(raw_input('Select the upcoming event. \n'))
63 | if shuffled_correctness_list[action]:
64 | matches += 1
65 | if verbose:
66 | print "Correct!"
67 | else:
68 | misses += 1
69 | if verbose:
70 |                         print "Not correct! The correct one was:", shuffled_answer_list[
71 | shuffled_correctness_list.index(True)]
72 | guesses += 1
73 | print ""
74 | print ""
75 | except KeyboardInterrupt:
76 | return matches, misses, guesses
77 |
78 | return matches, misses, guesses
79 |
80 |
81 | if __name__ == "__main__":
82 |
83 | parameters = load_parameters()
84 | ###########
85 | ds, vocab, model = build(parameters)
86 | total_matches = 0
87 | total_misses = 0
88 | total_guesses = 0
89 | while True:
90 | try:
91 | matches, misses, guesses = sample(ds, vocab, model, 4, split='train', verbose=0)
92 | total_matches += matches
93 | total_misses += misses
94 | total_guesses += guesses
95 | except KeyboardInterrupt:
96 | print "Interrupted!"
97 | print "Total number of matches: %d/%d" % (total_matches, total_guesses)
98 | print "Total number of misses: %d/%d" % (total_misses, total_guesses)
99 | print "Precision: %f" % (float(total_matches) / total_guesses)
100 | sys.exit(0)
101 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/utils/__init__.py
--------------------------------------------------------------------------------
/utils/common.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import json
4 | import os
5 | import re
6 |
7 | from toolz import itemmap
8 |
9 | from keras.optimizers import Adadelta
10 | from keras.optimizers import Adagrad
11 | from keras.optimizers import Adam
12 | from keras.optimizers import RMSprop
13 | from keras.optimizers import SGD
14 |
15 | PADDING = ''
16 | UNKNOWN = 'UNK'
17 | EOA = '' # end of answer
18 | EOQ = '' # end of question
19 | EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ]
20 | EXTRA_WORDS = {PADDING: 0, UNKNOWN: 1, EOA: 2, EOQ: 3}
21 | EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS)
22 | MAXLEN = 50
23 |
24 | OPTIMIZERS = { \
25 | 'sgd': SGD,
26 | 'adagrad': Adagrad,
27 | 'adadelta': Adadelta,
28 | 'rmsprop': RMSprop,
29 | 'adam': Adam,
30 | }
31 |
32 |
33 | ###
34 | # Functions
35 | ###
36 | def static_vars(**kwargs):
37 | def decorate(func):
38 | for k in kwargs:
39 | setattr(func, k, kwargs[k])
40 | return func
41 |
42 | return decorate
43 |
44 |
45 | @static_vars(counter=len(EXTRA_WORDS))
46 | def _myinc(d):
47 | """
48 |     Takes an item tuple d and returns (d[0], next_free_index), incrementing the shared counter.
49 | """
50 | x = d[0]
51 | _myinc.counter += 1
52 | return (x, _myinc.counter - 1)
53 |
54 |
55 | def create_dir_if_not_exists(directory):
56 | if not os.path.exists(directory):
57 | print 'creating directory %s' % directory
58 | os.makedirs(directory)
59 | else:
60 | print "%s already exists!" % directory
61 |
62 |
63 | def preprocess_line(line):
64 | cap_tmp = line.strip().decode('utf-8').lower().encode('utf8')
65 | return cap_tmp
66 |
67 |
68 | def preprocess_caption(cap):
69 | commaStrip = re.compile("(\d)(\,)(\d)")
70 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!']
71 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
72 |
73 | def processPunctuation(inText):
74 | outText = inText
75 | for p in punct:
76 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None):
77 | outText = outText.replace(p, '')
78 | else:
79 | outText = outText.replace(p, ' ')
80 | outText = periodStrip.sub("", outText, re.UNICODE)
81 | return outText
82 |
83 | cap_tmp = cap.strip().decode('utf-8').lower().encode('utf8')
84 | cap_tmp = processPunctuation(cap_tmp)
85 | return cap_tmp
86 |
87 |
88 | def preprocess_question(q):
89 | contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
90 | "couldn'tve": "couldn’t’ve", "couldnt’ve": "couldn’t’ve", "didnt": "didn’t", "doesnt": "doesn’t",
91 | "dont": "don’t", "hadnt": "hadn’t", "hadnt’ve": "hadn’t’ve", "hadn'tve": "hadn’t’ve",
92 | "hasnt": "hasn’t", "havent": "haven’t", "hed": "he’d", "hed’ve": "he’d’ve", "he’dve": "he’d’ve",
93 | "hes": "he’s", "howd": "how’d", "howll": "how’ll", "hows": "how’s", "Id’ve": "I’d’ve",
94 | "I’dve": "I’d’ve", "Im": "I’m", "Ive": "I’ve", "isnt": "isn’t", "itd": "it’d", "itd’ve": "it’d’ve",
95 | "it’dve": "it’d’ve", "itll": "it’ll", "let’s": "let’s", "maam": "ma’am", "mightnt": "mightn’t",
96 | "mightnt’ve": "mightn’t’ve", "mightn’tve": "mightn’t’ve", "mightve": "might’ve",
97 | "mustnt": "mustn’t",
98 | "mustve": "must’ve", "neednt": "needn’t", "notve": "not’ve", "oclock": "o’clock",
99 | "oughtnt": "oughtn’t",
100 | "ow’s’at": "’ow’s’at", "’ows’at": "’ow’s’at", "’ow’sat": "’ow’s’at", "shant": "shan’t",
101 | "shed’ve": "she’d’ve", "she’dve": "she’d’ve", "she’s": "she’s", "shouldve": "should’ve",
102 | "shouldnt": "shouldn’t", "shouldnt’ve": "shouldn’t’ve", "shouldn’tve": "shouldn’t’ve",
103 | "somebody’d": "somebodyd", "somebodyd’ve": "somebody’d’ve", "somebody’dve": "somebody’d’ve",
104 | "somebodyll": "somebody’ll", "somebodys": "somebody’s", "someoned": "someone’d",
105 | "someoned’ve": "someone’d’ve", "someone’dve": "someone’d’ve", "someonell": "someone’ll",
106 | "someones": "someone’s", "somethingd": "something’d", "somethingd’ve": "something’d’ve",
107 | "something’dve": "something’d’ve", "somethingll": "something’ll", "thats": "that’s",
108 | "thered": "there’d", "thered’ve": "there’d’ve", "there’dve": "there’d’ve", "therere": "there’re",
109 | "theres": "there’s", "theyd": "they’d", "theyd’ve": "they’d’ve", "they’dve": "they’d’ve",
110 | "theyll": "they’ll", "theyre": "they’re", "theyve": "they’ve", "twas": "’twas", "wasnt": "wasn’t",
111 | "wed’ve": "we’d’ve", "we’dve": "we’d’ve", "weve": "we've", "werent": "weren’t", "whatll": "what’ll",
112 | "whatre": "what’re", "whats": "what’s", "whatve": "what’ve", "whens": "when’s", "whered":
113 | "where’d", "wheres": "where's", "whereve": "where’ve", "whod": "who’d", "whod’ve": "who’d’ve",
114 | "who’dve": "who’d’ve", "wholl": "who’ll", "whos": "who’s", "whove": "who've", "whyll": "why’ll",
115 | "whyre": "why’re", "whys": "why’s", "wont": "won’t", "wouldve": "would’ve", "wouldnt": "wouldn’t",
116 | "wouldnt’ve": "wouldn’t’ve", "wouldn’tve": "wouldn’t’ve", "yall": "y’all", "yall’ll": "y’all’ll",
117 | "y’allll": "y’all’ll", "yall’d’ve": "y’all’d’ve", "y’alld’ve": "y’all’d’ve",
118 | "y’all’dve": "y’all’d’ve",
119 | "youd": "you’d", "youd’ve": "you’d’ve", "you’dve": "you’d’ve", "youll": "you’ll",
120 | "youre": "you’re", "youve": "you’ve"}
121 | manualMap = {'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6',
122 | 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'}
123 | articles = ['a', 'an', 'the']
124 | commaStrip = re.compile("(\d)(\,)(\d)")
125 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!']
126 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
127 |
128 | def processPunctuation(inText):
129 | outText = inText
130 | for p in punct:
131 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None):
132 | outText = outText.replace(p, '')
133 | else:
134 | outText = outText.replace(p, ' ')
135 | outText = periodStrip.sub("", outText, re.UNICODE)
136 | return outText
137 |
138 | def processDigitArticle(inText):
139 | outText = []
140 | tempText = inText.lower().split()
141 | for word in tempText:
142 | word = manualMap.setdefault(word, word)
143 | if word not in articles:
144 | outText.append(word)
145 | else:
146 | pass
147 | for wordId, word in enumerate(outText):
148 | if word in contractions:
149 | outText[wordId] = contractions[word]
150 | outText = ' '.join(outText)
151 | return outText
152 |
153 | q_tmp = q.strip().lower().encode('utf8')
154 | # q_tmp = processPunctuation(q_tmp)
155 | # q_tmp = processDigitArticle(q_tmp)
156 | if q_tmp[-1] == '?' and q_tmp[-2] != ' ':
157 | # separate word token from the question mark
158 | q_tmp = q_tmp[:-1] + ' ?'
159 | # remove question mark
160 | if q_tmp[-1] == '?': q_tmp = q_tmp[:-1]
161 | return q_tmp
162 |
163 |
164 | def save_txt_answers(samples, savefile='./sample', whichset='val', step=''):
165 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f:
166 | print >> f, '\n'.join(samples)
167 |
168 |
169 | def save_json_answers(samples, savefile='./sample', whichset='val', step=''):
170 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f:
171 | json.dump(samples, f)
172 |
173 |
174 | def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS,
175 | is_reset=True, truncate_to_most_frequent=0):
176 | """
177 | Builds vocabulary from wordcount.
178 | It also adds extra words to the vocabulary.
179 |
180 | In:
181 | this_wordcount - dictionary of wordcounts, e.g. {'cpu':3}
182 | extra_words - additional words to build the vocabulary
183 | dictionary of {word: id}
184 |             by default EXTRA_WORDS ({PADDING: 0, UNKNOWN: 1, EOA: 2, EOQ: 3})
185 | is_reset - if True we restart the vocabulary counting
186 |             by default True
187 | truncate_to_most_frequent - if positive then the vocabulary
188 | is truncated to 'truncate_to_most_frequent' words;
189 | by default 0
190 | Out:
191 | word2index - mapping from words to indices
192 | index2word - mapping from indices to words
193 | """
194 | if is_reset:
195 | _myinc.counter = len(EXTRA_WORDS)
196 | if truncate_to_most_frequent > 0:
197 | sorted_wordcount = dict(sorted(
198 | this_wordcount.items(), key=lambda x: x[1], reverse=True)[:truncate_to_most_frequent])
199 | this_wordcount = sorted_wordcount
200 | word2index = itemmap(_myinc, this_wordcount)
201 | if not extra_words == {}:
202 | assert (all([el not in word2index.values() for el in extra_words.values()]))
203 | word2index.update(extra_words)
204 | index2word = itemmap(reversed, word2index)
205 | return word2index, index2word
206 |
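# Illustrative usage of build_vocabulary (comment added for clarity): content words are assigned
# consecutive indices starting right after the four EXTRA_WORDS ids (0-3), e.g.
#   word2index, index2word = build_vocabulary({'dog': 3, 'cat': 1})
#   # -> 'dog' and 'cat' receive ids 4 and 5 (in dict iteration order), plus the extra words 0-3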
207 |
208 | def index_sequence(x, word2index):
209 | """
210 | Converts list of words into a list of its indices wrt. word2index, that is into
211 | index encoded sequence.
212 |
213 | In:
214 | x - list of lines
215 | word2index - mapping from words to indices
216 |
217 | Out:
218 | a list of the list of indices that encode the words
219 | """
220 | one_hot_x = []
221 | for line in x:
222 | line_list = []
223 | for w in line.split():
224 | w = w.strip()
225 | if w in word2index:
226 | this_ind = word2index[w]
227 | else:
228 | this_ind = word2index[UNKNOWN]
229 | line_list.append(this_ind)
230 | one_hot_x.append(line_list)
231 | return one_hot_x
232 |
--------------------------------------------------------------------------------
/utils/evaluate_from_file.py:
--------------------------------------------------------------------------------
1 | """
2 | Scores a file of hypotheses.
3 | Usage:
4 | 1. Set the references in this file (questions and annotations).
5 | 2. python evaluate_from_file.py -vqa -hyp hypothesis.json
6 | """
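# Illustrative invocations (comment added for clarity; file names are placeholders), using the flags defined below:
#   python evaluate_from_file.py -hyp test_sampling.pred -r reference.txt -l en
#   python evaluate_from_file.py -vqa -hyp hypothesis.json -q questions.json -a annotations.json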
7 |
8 | import argparse
9 |
10 | from pycocoevalcap.bleu.bleu import Bleu
11 | from pycocoevalcap.cider.cider import Cider
12 | from pycocoevalcap.meteor.meteor import Meteor
13 | from pycocoevalcap.rouge.rouge import Rouge
14 | from pycocoevalcap.vqa import vqaEval, visual_qa
15 |
16 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/'
17 | ROOT_PATH = '/media/HDD_2TB/DATASETS/'
18 |
19 | questions = ROOT_PATH + '/VQA/Questions/OpenEnded_mscoco_val2014_questions.json'
20 | annotations = ROOT_PATH + '/VQA/Annotations/mscoco_val2014_annotations.json'
21 |
22 | parser = argparse.ArgumentParser(
23 |     description="""Takes a hypotheses file and one or more reference files and
24 |         computes BLEU, METEOR, ROUGE and CIDEr metrics""", formatter_class=argparse.RawTextHelpFormatter)
25 | parser.add_argument('-vqa', default=False, action="store_true", help='Compute VQA metrics')
26 |
27 | parser.add_argument('-q', type=str, default=questions, help='Path to questions file (only if the -vqa flag is active)')
28 | parser.add_argument('-a', type=str, default=annotations,
29 | help='Path to annotations file (only if the -vqa flag is active)')
30 | parser.add_argument('-hyp', type=str, help='Hypotheses file')
31 |
32 | parser.add_argument('-l', type=str, default='en', help='Meteor language')
33 | parser.add_argument('-r', type=argparse.FileType('r'), nargs="+",
34 | help='Path to all the reference files (single-reference files)')
35 |
36 |
37 | def score_vqa(resFile, quesFile, annFile):
38 | # create vqa object and vqaRes object
39 | vqa_ = visual_qa.VQA(annFile, quesFile)
40 | vqaRes = vqa_.loadRes(resFile, quesFile)
41 | vqaEval_ = vqaEval.VQAEval(vqa_, vqaRes,
42 | n=2) # n is precision of accuracy (number of places after decimal), default is 2
43 | vqaEval_.evaluate()
44 | print "Overall Accuracy is: %.02f\n" % (vqaEval_.accuracy['overall'])
45 | return vqaEval_.accuracy['overall']
46 |
47 |
48 | def load_textfiles(references, hypothesis):
49 | print "The number of references is {}".format(len(references))
50 | hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesis)}
51 | # take out newlines before creating dictionary
52 | raw_refs = [map(str.strip, r) for r in zip(*references)]
53 | refs = {idx: rr for idx, rr in enumerate(raw_refs)}
54 | # sanity check that we have the same number of references as hypothesis
55 | if len(hypo) != len(refs):
56 | raise ValueError("There is a sentence number mismatch between the inputs: \n"
57 | "\t # sentences in references: %d\n"
58 | "\t # sentences in hypothesis: %d" % (len(refs), len(hypo)))
59 | return refs, hypo
60 |
61 |
62 | def CocoScore(ref, hypo, language='en'):
63 | """
64 | ref, dictionary of reference sentences (id, sentence)
65 | hypo, dictionary of hypothesis sentences (id, sentence)
66 | score, dictionary of scores
67 | """
68 | scorers = [
69 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
70 | (Meteor(language), "METEOR"),
71 | (Rouge(), "ROUGE_L"),
72 | (Cider(), "CIDEr")
73 | ]
74 | final_scores = {}
75 | for scorer, method in scorers:
76 | score, scores = scorer.compute_score(ref, hypo)
77 | if type(score) == list:
78 | for m, s in zip(method, score):
79 | final_scores[m] = s
80 | else:
81 | final_scores[method] = score
82 | return final_scores
83 |
84 |
85 | if __name__ == "__main__":
86 |
87 | args = parser.parse_args()
88 | vqa_evaluation = args.vqa
89 | if vqa_evaluation:
90 | questions = args.q
91 | annotations = args.a
92 | hypotheses = args.hyp
93 | print "hypotheses file:", hypotheses
94 | score = score_vqa(hypotheses, questions, annotations)
95 | print "Score: ", score
96 | else:
97 | language = args.l
98 | hypotheses = open(args.hyp, 'r')
99 | ref, hypo = load_textfiles(args.r, hypotheses)
100 | score = CocoScore(ref, hypo, language=language)
101 | print "Score: ", score
102 |
--------------------------------------------------------------------------------
/utils/plot_metric.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Read and plot several logs from cococaption
4 |
5 | if [ $# -lt 1 ];
6 | then
7 | echo "Usage: $0 [train.log] [val.log] [test.log]"; exit 1
8 | fi
9 |
10 | metric_pos="3"
11 | metric_name="Bleu_4"
12 | out_name="./${metric_name}_plot"
13 | tail -n +2 $1 | awk 'BEGIN{FS=","}{print 1}'>/tmp/epochs;
14 |
15 | i=1
16 | for result in "$@"; do
17 | basename=$(basename $result)
18 | tail -n +2 $result | awk -v pos=${metric_pos} 'BEGIN{FS=","}{print $pos}'>/tmp/${basename};
19 | names[$i]="${basename%.*}"
20 | i=$(( i + 1 ))
21 | basenames=${basenames}" /tmp/`basename $result`"
22 | done
23 | echo "Epoch ${names[*]}" > /tmp/scores
24 |
25 | paste -d " " /tmp/epochs $basenames >> /tmp/scores
26 |
27 | echo "set encoding iso_8859_1
28 |
29 | set style data lines
30 | set key font ',20' height 2
31 | set xtics font ',18'
32 | set ytics font ',18'
33 | set xlabel font ',20' '# Epoch'
34 | set ylabel font ',20' '${metric_name}';
35 |
36 | set title ''
37 | set terminal pdf enhanced
38 | set termoption dash
39 | set output '${out_name}.pdf'
40 | set key left
41 |
42 | set yrange[0:1]
43 | set ytics (0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0)
44 |
45 | set bmargin 4
46 | plot for [col=2:$(( $# + 1 ))] '/tmp/scores' using 0:col with lines lt col lw 5 title columnheader " | gnuplot
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/utils/prepare_features.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from common import create_dir_if_not_exists
4 |
5 | ###### Parameters
6 |
7 | ROOT_PATH = '/media/HDD_2TB/DATASETS/'
8 |
9 | base_path = ROOT_PATH + '/Flickr8k/Features/'
10 | features = 'KCNN' # KCNN, Scenes, Objects
11 | base_path_save = base_path + features
12 |
13 | feats_paths = ['train_' + features + '_features.csv',
14 | 'val_' + features + '_features.csv',
15 | 'test_' + features + '_features.csv']
16 |
17 | names_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt']
18 | folders_save = ['train', 'val', 'test']
19 |
20 | apply_L2 = False
21 | n_feats = 1024
22 |
23 | ############
24 |
25 | if apply_L2:
26 | file_save = features + '_L2'
27 | else:
28 | file_save = features
29 |
30 |
31 | def csv2npy():
32 | # Process each data split separately
33 | for n, f, fs in zip(names_lists, feats_paths, folders_save):
34 | print "Preparing features %s" % f
35 | feats_dict = dict()
36 | # Get file names
37 | names = []
38 | with open(base_path + '/' + n, 'r') as file:
39 | for line in file:
40 | line = line.rstrip('\n')
41 | line = line.split('.')[0]
42 | names.append(line)
43 | # Get features
44 | with open(base_path + '/' + f, 'r') as file:
45 | for i, line in enumerate(file):
46 | feats = np.fromstring(line.rstrip('\n'), sep=',')
47 | if (apply_L2):
48 | feats = feats / np.linalg.norm(feats, ord=2)
49 | # Insert in dictionary
50 | feats_dict[names[i]] = feats[:n_feats]
51 |
52 | # Store dict
53 | print "Saving features in %s" % (base_path_save + '/' + fs + '/' + file_save + '.npy')
54 | create_dir_if_not_exists(base_path_save + '/' + fs)
55 | np.save(base_path_save + '/' + fs + '/' + file_save + '.npy', feats_dict)
56 | print
57 |
58 |
59 | if __name__ == "__main__":
60 | csv2npy()
61 |
--------------------------------------------------------------------------------
/utils/pretrain_word_vectors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # Parameters
4 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/VQA/'
5 | ROOT_PATH = '/media/HDD_2TB/DATASETS/VQA/'
6 | base_path = ROOT_PATH + 'Glove/'
7 | glove_path = base_path + 'glove.42B.300d.txt'
8 | dest_file = 'glove_300'
9 |
10 |
11 | def glove2npy(glove_path, base_path_save, dest_file):
12 | vecs_dict = dict()
13 | print "Loading vectors from %s" % (glove_path)
14 |
15 | glove_vectors = [x[:-1] for x in open(glove_path).readlines()]
16 | n_vecs = len(glove_vectors)
17 | print "Found %d vectors in %s" % (n_vecs, glove_path)
18 | i = 0
19 | for vector in glove_vectors:
20 | v = vector.split()
21 | word = v[0]
22 | vec = np.asarray(v[1:], dtype='float32')
23 | vecs_dict[word] = vec
24 | i += 1
25 | if i % 1000 == 0:
26 | print "Processed", i, "vectors (", 100 * float(i) / n_vecs, "%)\r",
27 | print
28 | # Store dict
29 | print "Saving word vectors in %s" % (base_path_save + '/' + dest_file + '.npy')
30 | # create_dir_if_not_exists(base_path_save)
31 | np.save(base_path_save + '/' + dest_file + '.npy', vecs_dict)
32 | print
33 |
34 |
35 | if __name__ == "__main__":
36 | glove2npy(glove_path, base_path, dest_file)
37 |
--------------------------------------------------------------------------------
/utils/sort_by_split.py:
--------------------------------------------------------------------------------
1 | # Retrieves the images of a given split and sorts them according to that split
2 | import shutil
3 |
4 | from common import create_dir_if_not_exists
5 |
6 | image_dir = '/data/DATASETS/Flickr8k/Images'
7 | annotations_dir = '/data/DATASETS/Flickr8k/Annotations'
8 | split_name = 'val'
9 | dest_dir = image_dir + '/' + split_name + '_images'
10 | ext = '.jpg'
11 |
12 | with open(annotations_dir + '/' + split_name + '_list_ids.txt') as f:
13 | lines = f.readlines()
14 |
15 | create_dir_if_not_exists(dest_dir)
16 | n_items = len(str(len(lines))) + 1
17 | i = 0
18 | for filename in lines:
19 | i += 1
20 | shutil.copyfile(image_dir + '/' + filename[:-1] + ext, dest_dir + '/' + str(i).zfill(n_items) + ext)
21 |
--------------------------------------------------------------------------------
/utils/split_features.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def iter_loadtxt(filename, delimiter=',', skiprows=0, dtype=np.float32):
5 | def iter_func():
6 | with open(filename, 'r') as infile:
7 | for _ in range(skiprows):
8 | next(infile)
9 | for line in infile:
10 | line = line.rstrip().split(delimiter)
11 | for item in line:
12 | yield dtype(item)
13 | iter_loadtxt.rowlength = len(line)
14 |
15 | data = np.fromiter(iter_func(), dtype=dtype)
16 | data = data.reshape((-1, iter_loadtxt.rowlength))
17 | return data
18 |
19 |
20 | base_path = '/media/HDD_2TB/DATASETS/MSVD/Features/'
21 | feature = 'ImageNetFV_Places_C3Dfc8'
22 | out_feature = 'ImageNetFV'
23 |
24 | for split in ['train', 'val', 'test']:
25 | print "Loading %s features" % str(split + '_' + feature)
26 | # feats = np.genfromtxt(open(base_path + split + '_' + feature + "_features.csv", "rb"), delimiter=",", dtype='float32')
27 | feats = iter_loadtxt(base_path + split + '_' + feature + "_features.csv")
28 | new_feats = feats[:, :1024] # Modify this instruction to get the desired features!
29 | print "Saving %s features" % str(split + '_' + feature)
30 | np.savetxt(base_path + split + '_' + out_feature + "_features.csv", new_feats, delimiter=",")
31 |
--------------------------------------------------------------------------------
/utils/vocabulary_size.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | if [ $# -lt 1 ]
5 | then
6 | echo "Usage: $0 text_file"
7 | echo "Computes the vocabulary size of text_file"
8 | exit 1
9 | fi
10 |
11 |
12 | for file in $* ;do
13 | vocab=`cat $file | tr " " '\n' | sort -u |wc -l`
14 | echo "$file: $vocab"
15 | done
16 |
--------------------------------------------------------------------------------