├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── config.py ├── data_engine ├── README.md ├── __init__.py ├── generate_corpus_full_history.py ├── generate_descriptions_lists.py ├── generate_features_lists.py ├── generate_img_lists.py ├── generate_img_lists_from_split.py ├── generate_link_lists.py ├── generate_parallel_corpus.py ├── prepare_data.py ├── split_data.py └── subsample_frames_features.py ├── docs └── model.png ├── main.py ├── meta-optimizers └── spearmint │ ├── README.md │ ├── __init__.py │ ├── config.json │ ├── launch_spearmint.sh │ └── spearmint_opt.py ├── train.sh ├── turing_test.py ├── utils ├── __init__.py ├── common.py ├── evaluate_from_file.py ├── plot_metric.sh ├── prepare_features.py ├── pretrain_word_vectors.py ├── sort_by_split.py ├── split_features.py └── vocabulary_size.sh ├── viddesc_model.py └── visualization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | 3 | .idea 4 | 5 | /meta-optimizers/spearmint/db/ 6 | /meta-optimizers/spearmint/trained_models/ 7 | /meta-optimizers/spearmint/output/ 8 | 9 | ### Python template 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | ### Emacs template 69 | # -*- mode: gitignore; -*- 70 | *~ 71 | \#*\# 72 | /.emacs.desktop 73 | /.emacs.desktop.lock 74 | *.elc 75 | auto-save-list 76 | tramp 77 | .\#* 78 | 79 | # Org-mode 80 | .org-id-locations 81 | *_archive 82 | 83 | # flymake-mode 84 | *_flymake.* 85 | 86 | # eshell files 87 | /eshell/history 88 | /eshell/lastdir 89 | 90 | # elpa packages 91 | /elpa/ 92 | 93 | # reftex files 94 | *.rel 95 | 96 | # AUCTeX auto folder 97 | /auto/ 98 | 99 | # cask packages 100 | .cask/ 101 | 102 | # Models 103 | *.pkl 104 | *.json 105 | *.h5 106 | *.npy 107 | *.zip 108 | 109 | # Training results 110 | *.vqa 111 | *.coco 112 | *.multiclass 113 | *.pred 114 | *.txt 115 | 116 | # Visualization files 117 | .ipynb_checkpoints 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | Copyright (c) 2016, the respective contributors 4 | All rights reserved. 5 | 6 | ABiViRNet uses a shared copyright model: each contributor 7 | holds copyright over their contributions to ABiViRNet. The project versioning records 8 | all such contribution and copyright details. 
If a contributor wants to further 9 | mark their specific copyright on a particular contribution, they should 10 | indicate their copyright solely in the commit message of the change when it 11 | is committed. 12 | 13 | 14 | GNU GENERAL PUBLIC LICENSE 15 | Version 2, June 1991 16 | 17 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 18 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 | Everyone is permitted to copy and distribute verbatim copies 20 | of this license document, but changing it is not allowed. 21 | 22 | Preamble 23 | 24 | The licenses for most software are designed to take away your 25 | freedom to share and change it. By contrast, the GNU General Public 26 | License is intended to guarantee your freedom to share and change free 27 | software--to make sure the software is free for all its users. This 28 | General Public License applies to most of the Free Software 29 | Foundation's software and to any other program whose authors commit to 30 | using it. (Some other Free Software Foundation software is covered by 31 | the GNU Lesser General Public License instead.) You can apply it to 32 | your programs, too. 33 | 34 | When we speak of free software, we are referring to freedom, not 35 | price. Our General Public Licenses are designed to make sure that you 36 | have the freedom to distribute copies of free software (and charge for 37 | this service if you wish), that you receive source code or can get it 38 | if you want it, that you can change the software or use pieces of it 39 | in new free programs; and that you know you can do these things. 40 | 41 | To protect your rights, we need to make restrictions that forbid 42 | anyone to deny you these rights or to ask you to surrender the rights. 43 | These restrictions translate to certain responsibilities for you if you 44 | distribute copies of the software, or if you modify it. 45 | 46 | For example, if you distribute copies of such a program, whether 47 | gratis or for a fee, you must give the recipients all the rights that 48 | you have. You must make sure that they, too, receive or can get the 49 | source code. And you must show them these terms so they know their 50 | rights. 51 | 52 | We protect your rights with two steps: (1) copyright the software, and 53 | (2) offer you this license which gives you legal permission to copy, 54 | distribute and/or modify the software. 55 | 56 | Also, for each author's protection and ours, we want to make certain 57 | that everyone understands that there is no warranty for this free 58 | software. If the software is modified by someone else and passed on, we 59 | want its recipients to know that what they have is not the original, so 60 | that any problems introduced by others will not reflect on the original 61 | authors' reputations. 62 | 63 | Finally, any free program is threatened constantly by software 64 | patents. We wish to avoid the danger that redistributors of a free 65 | program will individually obtain patent licenses, in effect making the 66 | program proprietary. To prevent this, we have made it clear that any 67 | patent must be licensed for everyone's free use or not licensed at all. 68 | 69 | The precise terms and conditions for copying, distribution and 70 | modification follow. 71 | 72 | GNU GENERAL PUBLIC LICENSE 73 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 74 | 75 | 0. 
This License applies to any program or other work which contains 76 | a notice placed by the copyright holder saying it may be distributed 77 | under the terms of this General Public License. The "Program", below, 78 | refers to any such program or work, and a "work based on the Program" 79 | means either the Program or any derivative work under copyright law: 80 | that is to say, a work containing the Program or a portion of it, 81 | either verbatim or with modifications and/or translated into another 82 | language. (Hereinafter, translation is included without limitation in 83 | the term "modification".) Each licensee is addressed as "you". 84 | 85 | Activities other than copying, distribution and modification are not 86 | covered by this License; they are outside its scope. The act of 87 | running the Program is not restricted, and the output from the Program 88 | is covered only if its contents constitute a work based on the 89 | Program (independent of having been made by running the Program). 90 | Whether that is true depends on what the Program does. 91 | 92 | 1. You may copy and distribute verbatim copies of the Program's 93 | source code as you receive it, in any medium, provided that you 94 | conspicuously and appropriately publish on each copy an appropriate 95 | copyright notice and disclaimer of warranty; keep intact all the 96 | notices that refer to this License and to the absence of any warranty; 97 | and give any other recipients of the Program a copy of this License 98 | along with the Program. 99 | 100 | You may charge a fee for the physical act of transferring a copy, and 101 | you may at your option offer warranty protection in exchange for a fee. 102 | 103 | 2. You may modify your copy or copies of the Program or any portion 104 | of it, thus forming a work based on the Program, and copy and 105 | distribute such modifications or work under the terms of Section 1 106 | above, provided that you also meet all of these conditions: 107 | 108 | a) You must cause the modified files to carry prominent notices 109 | stating that you changed the files and the date of any change. 110 | 111 | b) You must cause any work that you distribute or publish, that in 112 | whole or in part contains or is derived from the Program or any 113 | part thereof, to be licensed as a whole at no charge to all third 114 | parties under the terms of this License. 115 | 116 | c) If the modified program normally reads commands interactively 117 | when run, you must cause it, when started running for such 118 | interactive use in the most ordinary way, to print or display an 119 | announcement including an appropriate copyright notice and a 120 | notice that there is no warranty (or else, saying that you provide 121 | a warranty) and that users may redistribute the program under 122 | these conditions, and telling the user how to view a copy of this 123 | License. (Exception: if the Program itself is interactive but 124 | does not normally print such an announcement, your work based on 125 | the Program is not required to print an announcement.) 126 | 127 | These requirements apply to the modified work as a whole. If 128 | identifiable sections of that work are not derived from the Program, 129 | and can be reasonably considered independent and separate works in 130 | themselves, then this License, and its terms, do not apply to those 131 | sections when you distribute them as separate works. 
But when you 132 | distribute the same sections as part of a whole which is a work based 133 | on the Program, the distribution of the whole must be on the terms of 134 | this License, whose permissions for other licensees extend to the 135 | entire whole, and thus to each and every part regardless of who wrote it. 136 | 137 | Thus, it is not the intent of this section to claim rights or contest 138 | your rights to work written entirely by you; rather, the intent is to 139 | exercise the right to control the distribution of derivative or 140 | collective works based on the Program. 141 | 142 | In addition, mere aggregation of another work not based on the Program 143 | with the Program (or with a work based on the Program) on a volume of 144 | a storage or distribution medium does not bring the other work under 145 | the scope of this License. 146 | 147 | 3. You may copy and distribute the Program (or a work based on it, 148 | under Section 2) in object code or executable form under the terms of 149 | Sections 1 and 2 above provided that you also do one of the following: 150 | 151 | a) Accompany it with the complete corresponding machine-readable 152 | source code, which must be distributed under the terms of Sections 153 | 1 and 2 above on a medium customarily used for software interchange; or, 154 | 155 | b) Accompany it with a written offer, valid for at least three 156 | years, to give any third party, for a charge no more than your 157 | cost of physically performing source distribution, a complete 158 | machine-readable copy of the corresponding source code, to be 159 | distributed under the terms of Sections 1 and 2 above on a medium 160 | customarily used for software interchange; or, 161 | 162 | c) Accompany it with the information you received as to the offer 163 | to distribute corresponding source code. (This alternative is 164 | allowed only for noncommercial distribution and only if you 165 | received the program in object code or executable form with such 166 | an offer, in accord with Subsection b above.) 167 | 168 | The source code for a work means the preferred form of the work for 169 | making modifications to it. For an executable work, complete source 170 | code means all the source code for all modules it contains, plus any 171 | associated interface definition files, plus the scripts used to 172 | control compilation and installation of the executable. However, as a 173 | special exception, the source code distributed need not include 174 | anything that is normally distributed (in either source or binary 175 | form) with the major components (compiler, kernel, and so on) of the 176 | operating system on which the executable runs, unless that component 177 | itself accompanies the executable. 178 | 179 | If distribution of executable or object code is made by offering 180 | access to copy from a designated place, then offering equivalent 181 | access to copy the source code from the same place counts as 182 | distribution of the source code, even though third parties are not 183 | compelled to copy the source along with the object code. 184 | 185 | 4. You may not copy, modify, sublicense, or distribute the Program 186 | except as expressly provided under this License. Any attempt 187 | otherwise to copy, modify, sublicense or distribute the Program is 188 | void, and will automatically terminate your rights under this License. 
189 | However, parties who have received copies, or rights, from you under 190 | this License will not have their licenses terminated so long as such 191 | parties remain in full compliance. 192 | 193 | 5. You are not required to accept this License, since you have not 194 | signed it. However, nothing else grants you permission to modify or 195 | distribute the Program or its derivative works. These actions are 196 | prohibited by law if you do not accept this License. Therefore, by 197 | modifying or distributing the Program (or any work based on the 198 | Program), you indicate your acceptance of this License to do so, and 199 | all its terms and conditions for copying, distributing or modifying 200 | the Program or works based on it. 201 | 202 | 6. Each time you redistribute the Program (or any work based on the 203 | Program), the recipient automatically receives a license from the 204 | original licensor to copy, distribute or modify the Program subject to 205 | these terms and conditions. You may not impose any further 206 | restrictions on the recipients' exercise of the rights granted herein. 207 | You are not responsible for enforcing compliance by third parties to 208 | this License. 209 | 210 | 7. If, as a consequence of a court judgment or allegation of patent 211 | infringement or for any other reason (not limited to patent issues), 212 | conditions are imposed on you (whether by court order, agreement or 213 | otherwise) that contradict the conditions of this License, they do not 214 | excuse you from the conditions of this License. If you cannot 215 | distribute so as to satisfy simultaneously your obligations under this 216 | License and any other pertinent obligations, then as a consequence you 217 | may not distribute the Program at all. For example, if a patent 218 | license would not permit royalty-free redistribution of the Program by 219 | all those who receive copies directly or indirectly through you, then 220 | the only way you could satisfy both it and this License would be to 221 | refrain entirely from distribution of the Program. 222 | 223 | If any portion of this section is held invalid or unenforceable under 224 | any particular circumstance, the balance of the section is intended to 225 | apply and the section as a whole is intended to apply in other 226 | circumstances. 227 | 228 | It is not the purpose of this section to induce you to infringe any 229 | patents or other property right claims or to contest validity of any 230 | such claims; this section has the sole purpose of protecting the 231 | integrity of the free software distribution system, which is 232 | implemented by public license practices. Many people have made 233 | generous contributions to the wide range of software distributed 234 | through that system in reliance on consistent application of that 235 | system; it is up to the author/donor to decide if he or she is willing 236 | to distribute software through any other system and a licensee cannot 237 | impose that choice. 238 | 239 | This section is intended to make thoroughly clear what is believed to 240 | be a consequence of the rest of this License. 241 | 242 | 8. 
If the distribution and/or use of the Program is restricted in 243 | certain countries either by patents or by copyrighted interfaces, the 244 | original copyright holder who places the Program under this License 245 | may add an explicit geographical distribution limitation excluding 246 | those countries, so that distribution is permitted only in or among 247 | countries not thus excluded. In such case, this License incorporates 248 | the limitation as if written in the body of this License. 249 | 250 | 9. The Free Software Foundation may publish revised and/or new versions 251 | of the General Public License from time to time. Such new versions will 252 | be similar in spirit to the present version, but may differ in detail to 253 | address new problems or concerns. 254 | 255 | Each version is given a distinguishing version number. If the Program 256 | specifies a version number of this License which applies to it and "any 257 | later version", you have the option of following the terms and conditions 258 | either of that version or of any later version published by the Free 259 | Software Foundation. If the Program does not specify a version number of 260 | this License, you may choose any version ever published by the Free Software 261 | Foundation. 262 | 263 | 10. If you wish to incorporate parts of the Program into other free 264 | programs whose distribution conditions are different, write to the author 265 | to ask for permission. For software which is copyrighted by the Free 266 | Software Foundation, write to the Free Software Foundation; we sometimes 267 | make exceptions for this. Our decision will be guided by the two goals 268 | of preserving the free status of all derivatives of our free software and 269 | of promoting the sharing and reuse of software generally. 270 | 271 | NO WARRANTY 272 | 273 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 274 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 275 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 276 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 277 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 278 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 279 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 280 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 281 | REPAIR OR CORRECTION. 282 | 283 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 284 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 285 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 286 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 287 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 288 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 289 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 290 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 291 | POSSIBILITY OF SUCH DAMAGES. 292 | 293 | END OF TERMS AND CONDITIONS 294 | 295 | How to Apply These Terms to Your New Programs 296 | 297 | If you develop a new program, and you want it to be of the greatest 298 | possible use to the public, the best way to achieve this is to make it 299 | free software which everyone can redistribute and change under these terms. 
300 | 301 | To do so, attach the following notices to the program. It is safest 302 | to attach them to the start of each source file to most effectively 303 | convey the exclusion of warranty; and each file should have at least 304 | the "copyright" line and a pointer to where the full notice is found. 305 | 306 | {description} 307 | Copyright (C) {year} {fullname} 308 | 309 | This program is free software; you can redistribute it and/or modify 310 | it under the terms of the GNU General Public License as published by 311 | the Free Software Foundation; either version 2 of the License, or 312 | (at your option) any later version. 313 | 314 | This program is distributed in the hope that it will be useful, 315 | but WITHOUT ANY WARRANTY; without even the implied warranty of 316 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 317 | GNU General Public License for more details. 318 | 319 | You should have received a copy of the GNU General Public License along 320 | with this program; if not, write to the Free Software Foundation, Inc., 321 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 322 | 323 | Also add information on how to contact you by electronic and paper mail. 324 | 325 | If the program is interactive, make it output a short notice like this 326 | when it starts in an interactive mode: 327 | 328 | Gnomovision version 69, Copyright (C) year name of author 329 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 330 | This is free software, and you are welcome to redistribute it 331 | under certain conditions; type `show c' for details. 332 | 333 | The hypothetical commands `show w' and `show c' should show the appropriate 334 | parts of the General Public License. Of course, the commands you use may 335 | be called something other than `show w' and `show c'; they could even be 336 | mouse-clicks or menu items--whatever suits your program. 337 | 338 | You should also get your employer (if you work as a programmer) or your 339 | school, if any, to sign a "copyright disclaimer" for the program, if 340 | necessary. Here is a sample; alter the names: 341 | 342 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 343 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 344 | 345 | {signature of Ty Coon}, 1 April 1989 346 | Ty Coon, President of Vice 347 | 348 | This General Public License does not permit incorporating your program into 349 | proprietary programs. If your program is a subroutine library, you may 350 | consider it more useful to permit linking proprietary applications with the 351 | library. If this is what you want to do, use the GNU Lesser General 352 | Public License instead of this License. 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Egocentric Video Description based on Temporally-Linked Sequences 2 | 3 | This repository contains the code for building the Temporally-linked Multi-input Attention (TMA) model, which was presented in 4 | the work [Egocentric Video Description based on Temporally-Linked Sequences](), 5 | submitted to the [Journal of Visual Communication and Image Representation](https://www.journals.elsevier.com/journal-of-visual-communication-and-image-representation). 6 | With this module, you can replicate our experiments and easily deploy new models. 
TMA is built upon our fork of 7 | [Keras](https://github.com/MarcBS/keras) framework ([version 1.2](https://github.com/MarcBS/keras/tree/Keras-1.2-(stable))) and tested with the [Theano](http://deeplearning.net/software/theano) 8 | backend. 9 | 10 | ## Features: 11 | 12 | * Temporally-linked mechanism for learning using information from previous events. 13 | * Multi-input Attention LSTM model over any of the input multimodal sequences. 14 | * Peeked decoder LSTM: the previously generated word is an input of the current LSTM timestep. 15 | * MLPs for initializing the LSTM hidden and memory states. 16 | * Beam search decoding. 17 | 18 | ## Architecture 19 | 20 | ![TMA_model](./docs/model.png) 21 | 22 | ## Requirements 23 | 24 | TMA requires the following libraries: 25 | 26 | - [Our version of Keras](https://github.com/MarcBS/keras) >= 1.2.3 27 | - [Multimodal Keras Wrapper](https://github.com/MarcBS/multimodal_keras_wrapper) >= 0.7 28 | - [Coco-caption evaluation package](https://github.com/lvapeab/coco-caption/tree/master/pycocoevalcap/) 29 | 30 | ## Instructions: 31 | 32 | Assuming you have a dataset and features extracted from the video frames: 33 | 34 | 0) Set the paths to Keras and the Multimodal Keras Wrapper in `train.sh` 35 | 36 | 1) Prepare data: 37 | 38 | `` 39 | python data_engine/subsample_frames_features.py 40 | `` 41 | 42 | `` 43 | python data_engine/generate_features_lists.py 44 | `` 45 | 46 | `` 47 | python data_engine/generate_descriptions_lists.py 48 | `` 49 | 50 | See [data_engine/README.md](data_engine/README.md) for detailed information. 51 | 52 | 2) Prepare the inputs/outputs of your model in `data_engine/prepare_data.py` 53 | 54 | 3) Set a model configuration in `config.py` (a quick way to inspect the resulting configuration is shown in the "Checking the configuration" section below) 55 | 56 | 4) Train: 57 | 58 | `` 59 | python main.py 60 | `` 61 | 62 | ## Dataset 63 | 64 | The dataset [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) was used to evaluate this model. It was acquired with the wearable camera Narrative Clip, which takes a picture every 30 seconds (2 fpm). It consists of 55 days acquired by 9 people, containing a total of 48,717 images divided into 1,339 events (or image sequences) and 3,991 captions. 65 | 66 | ## Citation 67 | 68 | If you use this code for any purpose, please do not forget to cite the following paper: 69 | 70 | ``` 71 | Marc Bolaños, Álvaro Peris, Francisco Casacuberta, Sergi Soler and Petia Radeva. 72 | Egocentric Video Description based on Temporally-Linked Sequences 73 | In Special Issue on Egocentric Vision and Lifelogging Tools. 74 | Journal of Visual Communication and Image Representation (VCIR), (SUBMITTED). 75 | ``` 76 | 77 | ## About 78 | 79 | Joint collaboration between the [Computer Vision at the University of Barcelona (CVUB)](http://www.ub.edu/cvub/) group at [Universitat de Barcelona](http://www.ub.edu)-[CVC](http://www.cvc.uab.es) and the [PRHLT Research Center](https://www.prhlt.upv.es) at [Universitat Politècnica de València](https://www.upv.es).
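## Checking the configuration

All hyperparameters (dataset paths, model type, optimizer, beam size, etc.) are defined in `config.py` and returned as a dictionary by `load_parameters()`. The snippet below is only a quick sketch for inspecting the configuration currently defined in `config.py`; it assumes it is run from the repository root and it does not start any training:

```python
from config import load_parameters

params = load_parameters()

# A few of the parameters that are most often worth checking before a run.
print('Model type:   ' + params['MODEL_TYPE'])
print('Dataset:      ' + params['DATASET_NAME'])
print('Model name:   ' + params['MODEL_NAME'])
print('Results path: ' + params['STORE_PATH'])
print('Beam size:    ' + str(params['BEAM_SIZE']))
```

Note that `MODEL_NAME` (and therefore `STORE_PATH`) is built automatically from the main hyperparameters, which helps keep the results of different configurations in separate folders.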
80 | 81 | 82 | ## Contact 83 | 84 | Marc Bolaños ([web page](http://www.ub.edu/cvub/marcbolanos/)): marc.bolanos@ub.edu 85 | 86 | Álvaro Peris ([web page](http://lvapeab.github.io/)): lvapeab@prhlt.upv.es 87 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/__init__.py -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | def load_parameters(): 2 | """ 3 | Loads the defined parameters 4 | """ 5 | # Input data params 6 | DATA_ROOT_PATH = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | 8 | # preprocessed features 9 | DATASET_NAME = 'EDUB-SegDesc_features' # Dataset name (add '-linked' suffix for using 10 | # dataset with temporally-linked training data) 11 | # 12 | # -linked 13 | # -linked-upperbound 14 | # -linked-upperbound-copy 15 | # -linked-upperbound-prev 16 | # -linked-upperbound-nocopy 17 | # -linked-video 18 | # -linked-vidtext 19 | # -vidtext-embed 20 | # 21 | 22 | PRE_TRAINED_DATASET_NAME = None #'MSVD_features' # Dataset name for reusing vocabulary of pre-trained model (set to None for disabling) 23 | # (only applicable if we are using a pre-trained model, default None) 24 | VOCABULARIES_MAPPING = {'description': 'description', 25 | 'state_below': 'description', 26 | 'prev_description': 'description'} 27 | 28 | PRE_TRAINED_VOCABULARY_NAME = None #'1BillionWords_vocabulary' # Dataset name for reusing vocabulary of pre-trained model 29 | 30 | # Input data 31 | INPUT_DATA_TYPE = 'video-features' # 'video-features' or 'video' 32 | NUM_FRAMES = 26 # fixed number of input frames per video 33 | 34 | if '-noninfo' in DATASET_NAME: 35 | suffix_annotations = '_without_noninfo' 36 | suffix_features = '_Without_NonInfo' 37 | else: 38 | suffix_annotations = '' 39 | suffix_features = '' 40 | 41 | #### Features from video frames 42 | FRAMES_LIST_FILES = {'train': 'Annotations/%s/train_feat_list'+suffix_annotations+'.txt', # Feature frames list files 43 | 'val': 'Annotations/%s/val_feat_list'+suffix_annotations+'.txt', 44 | 'test': 'Annotations/%s/test_feat_list'+suffix_annotations+'.txt', 45 | } 46 | FRAMES_COUNTS_FILES = { 'train': 'Annotations/%s/train_feat_counts'+suffix_annotations+'.txt', # Frames counts files 47 | 'val': 'Annotations/%s/val_feat_counts'+suffix_annotations+'.txt', 48 | 'test': 'Annotations/%s/test_feat_counts'+suffix_annotations+'.txt', 49 | } 50 | FEATURE_NAMES = ['ImageNet' 51 | + suffix_features] # append '_L2' at the end of each feature type if using their L2 version 52 | 53 | # Output data 54 | DESCRIPTION_FILES = {'train': 'Annotations/train_descriptions'+suffix_annotations+'.txt', # Description files 55 | 'val': 'Annotations/val_descriptions'+suffix_annotations+'.txt', 56 | 'test': 'Annotations/test_descriptions'+suffix_annotations+'.txt', 57 | } 58 | DESCRIPTION_COUNTS_FILES = { 'train': 'Annotations/train_descriptions_counts'+suffix_annotations+'.npy', # Description counts files 59 | 'val': 'Annotations/val_descriptions_counts'+suffix_annotations+'.npy', 60 | 'test': 'Annotations/test_descriptions_counts'+suffix_annotations+'.npy', 61 | } 62 | 63 | # Dataset parameters 64 | if not '-vidtext-embed' in DATASET_NAME: 65 | INPUTS_IDS_DATASET = ['video', 'state_below'] # Corresponding inputs of the dataset 66 | OUTPUTS_IDS_DATASET = 
['description'] # Corresponding outputs of the dataset 67 | INPUTS_IDS_MODEL = ['video', 'state_below'] # Corresponding inputs of the built model 68 | OUTPUTS_IDS_MODEL = ['description'] # Corresponding outputs of the built model 69 | else: 70 | INPUTS_IDS_DATASET = ['video', 'description'] # Corresponding inputs of the dataset 71 | OUTPUTS_IDS_DATASET = ['match'] # Corresponding outputs of the dataset 72 | INPUTS_IDS_MODEL = ['video', 'description'] # Corresponding inputs of the built model 73 | OUTPUTS_IDS_MODEL = ['match'] # Corresponding outputs of the built model 74 | 75 | 76 | if '-linked' in DATASET_NAME: 77 | 78 | LINK_SAMPLE_FILES = {'train': 'Annotations/train_link_samples'+suffix_annotations+'.txt', # Links index files 79 | 'val': 'Annotations/val_link_samples'+suffix_annotations+'.txt', 80 | 'test': 'Annotations/test_link_samples'+suffix_annotations+'.txt', 81 | } 82 | 83 | INPUTS_IDS_DATASET.append('prev_description') 84 | INPUTS_IDS_MODEL.append('prev_description') 85 | 86 | if '-vidtext' in DATASET_NAME: 87 | INPUTS_IDS_DATASET.append('prev_video') 88 | INPUTS_IDS_MODEL.append('prev_video') 89 | 90 | if '-upperbound' not in DATASET_NAME and '-video' not in DATASET_NAME: 91 | INPUTS_IDS_DATASET.append('link_index') 92 | INPUTS_IDS_MODEL.append('link_index') 93 | 94 | 95 | # Evaluation params 96 | if not '-vidtext-embed' in DATASET_NAME: 97 | METRICS = ['coco'] # Metric used for evaluating model after each epoch (leave empty if only prediction is required) 98 | else: 99 | METRICS = ['multiclass_metrics'] 100 | EVAL_ON_SETS = ['val', 'test'] # Possible values: 'train', 'val' and 'test' (external evaluator) 101 | EVAL_ON_SETS_KERAS = [] # Possible values: 'train', 'val' and 'test' (Keras' evaluator) 102 | START_EVAL_ON_EPOCH = 0 # First epoch where the model will be evaluated 103 | EVAL_EACH_EPOCHS = False # Select whether evaluate between N epochs or N updates 104 | EVAL_EACH = 50 # Sets the evaluation frequency (epochs or updates) 105 | 106 | # Search parameters 107 | SAMPLING = 'max_likelihood' # Possible values: multinomial or max_likelihood (recommended) 108 | TEMPERATURE = 1 # Multinomial sampling parameter 109 | if not '-vidtext-embed' in DATASET_NAME: 110 | BEAM_SEARCH = True # Switches on-off the beam search procedure 111 | else: 112 | BEAM_SEARCH = False 113 | BEAM_SIZE = 10 # Beam size (in case of BEAM_SEARCH == True) 114 | BEAM_SEARCH_COND_INPUT = 1 # Index of the conditional input used in beam search (i.e., state_below) 115 | OPTIMIZED_SEARCH = True # Compute annotations only a single time per sample 116 | NORMALIZE_SAMPLING = False # Normalize hypotheses scores according to their length 117 | ALPHA_FACTOR = .6 # Normalization according to length**ALPHA_FACTOR 118 | # (see: arxiv.org/abs/1609.08144) 119 | 120 | # Sampling params: Show some samples during training 121 | if not '-vidtext-embed' in DATASET_NAME: 122 | SAMPLE_ON_SETS = ['train', 'val'] # Possible values: 'train', 'val' and 'test' 123 | else: 124 | SAMPLE_ON_SETS = [] 125 | N_SAMPLES = 5 # Number of samples generated 126 | START_SAMPLING_ON_EPOCH = 0 # First epoch where the model will be evaluated 127 | SAMPLE_EACH_UPDATES = 50 # Sampling frequency (default 450) 128 | 129 | # Word representation params 130 | TOKENIZATION_METHOD = 'tokenize_icann' # Select which tokenization we'll apply: 131 | # tokenize_basic, tokenize_aggressive, tokenize_soft, 132 | # tokenize_icann or tokenize_questions 133 | 134 | FILL = 'end' # whether we fill the 'end' or the 'start' of the sentence with 0s 135 | TRG_LAN = 'en' # 
Language of the outputs (mainly used for the Meteor evaluator) 136 | PAD_ON_BATCH = True # Whether we take as many timesteps as the longes sequence of the batch 137 | # or a fixed size (MAX_OUTPUT_TEXT_LEN) 138 | 139 | # Input image parameters 140 | DATA_AUGMENTATION = False # Apply data augmentation on input data (noise on features) 141 | DATA_AUGMENTATION_TYPE = ['random_selection'] # 'random_selection', 'noise' 142 | IMG_FEAT_SIZE = 1024 # Size of the image features 143 | 144 | # Output text parameters 145 | OUTPUT_VOCABULARY_SIZE = 0 # Size of the input vocabulary. Set to 0 for using all, 146 | # otherwise it will be truncated to these most frequent words. 147 | MAX_OUTPUT_TEXT_LEN = 30 # Maximum length of the output sequence 148 | # set to 0 if we want to use the whole answer as a single class 149 | MAX_OUTPUT_TEXT_LEN_TEST = 50 # Maximum length of the output sequence during test time 150 | MIN_OCCURRENCES_VOCAB = 0 # Minimum number of occurrences allowed for the words in the vocabulay. 151 | 152 | # Optimizer parameters (see model.compile() function) 153 | LOSS = 'categorical_crossentropy' 154 | CLASSIFIER_ACTIVATION = 'softmax' 155 | 156 | OPTIMIZER = 'Adadelta' # Optimizer 157 | LR = 1. # Learning rate. Recommended values - Adam 0.001 - Adadelta 1.0 158 | CLIP_C = 10. # During training, clip gradients to this norm 159 | if not '-vidtext-embed' in DATASET_NAME: 160 | SAMPLE_WEIGHTS = True # Select whether we use a weights matrix (mask) for the data outputs 161 | LR_DECAY = None # Minimum number of epochs before the next LR decay. Set to None if don't want to decay the learning rate 162 | LR_GAMMA = 0.995 # Multiplier used for decreasing the LR 163 | 164 | # Training parameters 165 | MAX_EPOCH = 200 # Stop when computed this number of epochs 166 | BATCH_SIZE = 64 # ABiViRNet trained with BATCH_SIZE = 64 167 | 168 | HOMOGENEOUS_BATCHES = False # Use batches with homogeneous output lengths for every minibatch (Possibly buggy!) 169 | PARALLEL_LOADERS = 8 # Parallel data batch loaders 170 | EPOCHS_FOR_SAVE = 1 if EVAL_EACH_EPOCHS else None # Number of epochs between model saves (None for disabling epoch save) 171 | WRITE_VALID_SAMPLES = True # Write valid samples in file 172 | SAVE_EACH_EVALUATION = True if not EVAL_EACH_EPOCHS else False # Save each time we evaluate the model 173 | 174 | # Early stop parameters 175 | EARLY_STOP = True # Turns on/off the early stop protocol 176 | PATIENCE = 20 # We'll stop if the val STOP_METRIC does not improve after this 177 | # number of evaluations 178 | 179 | if not '-vidtext-embed' in DATASET_NAME: 180 | STOP_METRIC = 'Bleu_4' # Metric for the stop 181 | else: 182 | STOP_METRIC = 'accuracy' 183 | 184 | # Model parameters 185 | MODEL_TYPE = 'TemporallyLinkedVideoDescriptionAttDoublePrev' # 'ArcticVideoCaptionWithInit' 186 | # 'ArcticVideoCaptionNoLSTMEncWithInit' 187 | # 'TemporallyLinkedVideoDescriptionNoAtt' 188 | # 'TemporallyLinkedVideoDescriptionAtt' 189 | # 'TemporallyLinkedVideoDescriptionAttDoublePrev' 190 | # 'VideoTextEmbedding' 191 | # 'DeepSeek' 192 | 193 | RNN_TYPE = 'LSTM' # RNN unit type ('LSTM' supported) 194 | 195 | # Input text parameters 196 | TARGET_TEXT_EMBEDDING_SIZE = 301 # Source language word embedding size (ABiViRNet 301) 197 | TRG_PRETRAINED_VECTORS = None # Path to pretrained vectors. (e.g. DATA_ROOT_PATH + '/DATA/word2vec.%s.npy' % TRG_LAN) 198 | # Set to None if you don't want to use pretrained vectors. 
199 | # When using pretrained word embeddings, the size of the pretrained word embeddings must match with the word embeddings size. 200 | TRG_PRETRAINED_VECTORS_TRAINABLE = True # Finetune or not the target word embedding vectors. 201 | 202 | # Encoder configuration 203 | ENCODER_HIDDEN_SIZE = 717 # For models with RNN encoder (ABiViRNet 717) 204 | BIDIRECTIONAL_ENCODER = True # Use bidirectional encoder 205 | N_LAYERS_ENCODER = 1 # Stack this number of encoding layers (default 1) 206 | BIDIRECTIONAL_DEEP_ENCODER = True # Use bidirectional encoder in all encoding layers 207 | 208 | 209 | # Previous sentence encoder 210 | PREV_SENT_ENCODER_HIDDEN_SIZE = 717 # For models with previous sentence RNN encoder (484) 211 | BIDIRECTIONAL_PREV_SENT_ENCODER = True # Use bidirectional encoder 212 | N_LAYERS_PREV_SENT_ENCODER = 1 # Stack this number of encoding layers 213 | BIDIRECTIONAL_DEEP_PREV_SENT_ENCODER = True # Use bidirectional encoder in all encoding layers 214 | 215 | DECODER_HIDDEN_SIZE = 484 # For models with LSTM decoder (ABiViRNet 484) 216 | SKIP_VECTORS_HIDDEN_SIZE = TARGET_TEXT_EMBEDDING_SIZE 217 | ADDITIONAL_OUTPUT_MERGE_MODE = 'sum' # Merge mode for the skip connections 218 | WEIGHTED_MERGE = False # Wether we want to apply a conventional or a weighted merge 219 | 220 | 221 | AFFINE_LAYERS_DIM = 500 # Dimensionality of the affine layers in 'DeepSeek' model 222 | 223 | IMG_EMBEDDING_LAYERS = [] # FC layers for visual embedding 224 | # Here we should specify the activation function and the output dimension 225 | # (e.g IMG_EMBEDDING_LAYERS = [('linear', 1024)] 226 | 227 | # Fully-Connected layers for initializing the first RNN state 228 | # Here we should only specify the activation function of each layer 229 | # (as they have a potentially fixed size) 230 | # (e.g INIT_LAYERS = ['tanh', 'relu']) 231 | INIT_LAYERS = ['tanh'] 232 | 233 | # Additional Fully-Connected layers's sizes applied before softmax. 234 | # Here we should specify the activation function and the output dimension 235 | # (e.g DEEP_OUTPUT_LAYERS = [('tanh', 600), ('relu', 400), ('relu', 200)]) 236 | DEEP_OUTPUT_LAYERS = [] 237 | 238 | # Regularizers 239 | WEIGHT_DECAY = 1e-4 # L2 regularization 240 | RECURRENT_WEIGHT_DECAY = 0. # L2 regularization in recurrent layers 241 | 242 | USE_DROPOUT = True # Use dropout 243 | DROPOUT_P = 0.5 # Percentage of units to drop 244 | 245 | USE_RECURRENT_DROPOUT = False # Use dropout in recurrent layers # DANGEROUS! 
246 | RECURRENT_DROPOUT_P = 0.5 # Percentage of units to drop in recurrent layers 247 | 248 | USE_NOISE = True # Use gaussian noise during training 249 | NOISE_AMOUNT = 0.01 # Amount of noise 250 | 251 | USE_BATCH_NORMALIZATION = True # If True it is recommended to deactivate Dropout 252 | BATCH_NORMALIZATION_MODE = 1 # See documentation in Keras' BN 253 | 254 | USE_PRELU = False # use PReLU activations as regularizer 255 | USE_L2 = False # L2 normalization on the features 256 | 257 | # Results plot and models storing parameters 258 | EXTRA_NAME = '' # This will be appended to the end of the model name 259 | MODEL_NAME = DATASET_NAME + '_' + MODEL_TYPE +\ 260 | '_txtemb_' + str(TARGET_TEXT_EMBEDDING_SIZE) + \ 261 | '_imgemb_' + '_'.join([layer[0] for layer in IMG_EMBEDDING_LAYERS]) + \ 262 | '_lstmenc_' + str(ENCODER_HIDDEN_SIZE) + \ 263 | '_lstm_' + str(DECODER_HIDDEN_SIZE) + \ 264 | '_additional_output_mode_' + str(ADDITIONAL_OUTPUT_MERGE_MODE) + \ 265 | '_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \ 266 | '_' + OPTIMIZER + '_decay_' + str(LR_DECAY) + '-' + str(LR_GAMMA) 267 | 268 | MODEL_NAME += '_' + EXTRA_NAME 269 | 270 | # Name and location of the pre-trained model (only if RELOAD > 0) 271 | PRE_TRAINED_MODELS = ['MSVD_best_model'] 272 | # default: MODEL_NAME 273 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification_BLSTM_text'] 274 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adadelta_decay_None-0.95_vidtext_classification'] 275 | # ['EDUB-SegDesc_features-vidtext-embed_VideoTextEmbedding_txtemb_301_imgemb__lstmenc_717_lstm_484_additional_output_mode_sum_deepout__Adam_decay_1-0.95vidtext_embed'] 276 | # ['MSVD_best_model'] 277 | # ['MSVD_best_model', '1BillionWords'] 278 | PRE_TRAINED_MODEL_STORE_PATHS = map(lambda x: 'trained_models/' + x + '/', PRE_TRAINED_MODELS) if isinstance(PRE_TRAINED_MODELS, list) else 'trained_models/'+PRE_TRAINED_MODELS+'/' 279 | LOAD_WEIGHTS_ONLY = True # Load weights of pre-trained model or complete Model_Wrapper instance 280 | # Layers' mapping from old to new model if LOAD_WEIGHTS_ONLY 281 | # You can check the layers of a model with [layer.name for layer in model_wrapper.model.layers] 282 | if '-video' in DATASET_NAME: 283 | # Pre-train MSVD 284 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 285 | 'initial_state': 'initial_state', 286 | 'initial_memory': 'initial_memory', 287 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond', 288 | 'target_word_embedding': 'target_word_embedding', 289 | 'logit_ctx': 'logit_ctx', 290 | 'logit_lstm': 'logit_lstm', 291 | 'description': 'description' 292 | } 293 | ] 294 | # Pre-train vidtext embedding 295 | """ 296 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM', 297 | 'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', 298 | 'target_word_embedding': 'target_word_embedding', 299 | 'logit_ctx': 'logit_ctx', 300 | 'logit_prev': 'logit_prev', 301 | } 302 | ] 303 | """ 304 | 305 | elif '-vidtext-embed' in DATASET_NAME: 306 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 307 | 'target_word_embedding': 'target_word_embedding', 308 | 'logit_ctx': 'logit_ctx', 309 | } 310 | ] 311 | else: 312 | if MODEL_TYPE == 'ArcticVideoCaptionWithInit': 313 | 
LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 314 | 'initial_state': 'initial_state', 315 | 'initial_memory': 'initial_memory', 316 | 'attlstmcond_1': 'decoder_AttLSTMCond', 317 | 'target_word_embedding': 'target_word_embedding', 318 | 'logit_ctx': 'logit_ctx', 319 | 'logit_lstm': 'logit_lstm', 320 | 'description': 'description' 321 | } 322 | ] 323 | 324 | elif MODEL_TYPE == 'TemporallyLinkedVideoDescriptionAttDoublePrev': 325 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 326 | 'initial_state': 'initial_state', 327 | 'initial_memory': 'initial_memory', 328 | 'attlstmcond_1': 'decoder_AttLSTMCond3Inputs', # 'decoder_AttLSTMCond', 329 | 'target_word_embedding': 'target_word_embedding', 330 | 'logit_ctx': 'logit_ctx', 331 | 'logit_lstm': 'logit_lstm', 332 | 'description': 'description' 333 | } 334 | ] 335 | 336 | elif len(PRE_TRAINED_MODELS) == 2: 337 | LAYERS_MAPPING = [{'bidirectional_encoder': 'bidirectional_encoder_LSTM', 338 | 'initial_state': 'initial_state', 339 | 'initial_memory': 'initial_memory', 340 | 'attlstmcond_1': 'decoder_AttLSTMCond2Inputs', # 'decoder_AttLSTMCond', 341 | #'target_word_embedding': 'target_word_embedding', 342 | 'logit_ctx': 'logit_ctx', 343 | 'logit_lstm': 'logit_lstm', 344 | #'description': 'description' 345 | }, 346 | {'bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', #'prev_desc_emb_encoder_LSTM', 347 | 'target_word_embedding': 'target_word_embedding', 348 | 'decoder_AttLSTMCond': 'decoder_AttLSTMCond2Inputs', #'decoder_AttLSTMCond', 349 | 'target_text': 'description' 350 | } 351 | ] 352 | 353 | elif len(PRE_TRAINED_MODELS) == 1: # reuse data from vidtext-embedding model 354 | 355 | LAYERS_MAPPING = [{'bidirectional_encoder_LSTM': 'bidirectional_encoder_LSTM', 356 | 'prev_desc_emb_bidirectional_encoder_LSTM': 'prev_desc_emb_bidirectional_encoder_LSTM', 357 | 'target_word_embedding': 'target_word_embedding', 358 | 'logit_ctx': 'logit_ctx', 359 | 'logit_prev': 'logit_prev', 360 | } 361 | ] 362 | 363 | 364 | STORE_PATH = 'trained_models/' + MODEL_NAME + '/' # Models and evaluation results will be stored here 365 | DATASET_STORE_PATH = 'datasets/' # Dataset instance will be stored here 366 | 367 | SAMPLING_SAVE_MODE = 'list' # 'list' or 'vqa' 368 | VERBOSE = 1 # Vqerbosity level 369 | RELOAD = 0 # If 0 start training from scratch, otherwise the model 370 | # Saved on epoch 'RELOAD' will be used 371 | REBUILD_DATASET = True # Build again or use stored instance 372 | MODE = 'training' # 'training' or 'sampling' (if 'sampling' then RELOAD must 373 | # be greater than 0 and EVAL_ON_SETS will be used) 374 | RELOAD_PATH = None 375 | SAMPLING_RELOAD_EPOCH = False 376 | SAMPLING_RELOAD_POINT = 0 377 | # Extra parameters for special trainings 378 | TRAIN_ON_TRAINVAL = False # train the model on both training and validation sets combined 379 | FORCE_RELOAD_VOCABULARY = False # force building a new vocabulary from the training samples applicable if RELOAD > 1 380 | 381 | # ============================================ 382 | parameters = locals().copy() 383 | return parameters 384 | -------------------------------------------------------------------------------- /data_engine/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing of EDUB-SegDesc dataset 2 | 3 | The scripts stored in this folder 'data_engine' are intended to preprocess the data from the [EDUB-SegDesc](http://www.ub.edu/cvub/edub-segdesc/) dataset in order to use them as 
an input for building a Dataset object instance (see [staged_keras_wrapper](https://github.com/MarcBS/staged_keras_wrapper)). 4 | 5 | Two different kinds of inputs can be used for training the video description models: 6 | 7 | 1) Raw video frames (see section 'Image lists generation') 8 | 2) Features from video frames (see section 'Image features generation') 9 | 10 | Additionally, we can train a model on temporally-linked samples; in that case, we have to run an additional pre-processing script. 11 | 12 | ## Folder structure 13 | 14 | In the following, we describe the desired folder structure for storing the dataset-related information: 15 | 16 | ./Images 17 | video_[video_id] 18 | [num_image].jpg 19 | [num_image].jpg 20 | ./Annotations 21 | test_list.txt 22 | train_list.txt 23 | val_list.txt 24 | captions.id.en 25 | ./Features 26 | test_[name_feat].csv 27 | train_[name_feat].csv 28 | val_[name_feat].csv 29 | 30 | The folder ./Images contains a set of folders 'video_[video_id]', where each folder represents a video and contains a set of frames '[num_image].jpg'. 31 | 32 | The folder ./Annotations contains, for each set split {train, val, test}, a file with the suffix _list.txt, containing the list of videos 'video_[video_id]' belonging to the respective split. It also contains the file 'captions.id.en', which lists all the available captions for all the videos. 33 | 34 | The folder ./Features contains any kind of features extracted from the respective set splits (only needed if using image features instead of raw images). 35 | 36 | 37 | ## Descriptions generation 38 | 39 | This step will be needed whether we are using raw video frames or video features. 40 | 41 | Script name: 42 | generate_descriptions_lists.py 43 | Description: 44 | Extracts and counts the available descriptions for each video. 45 | Output: 46 | - A file per split with the suffix _descriptions.txt. 47 | Containing a list of descriptions for all videos. 48 | - A file per split with the suffix _descriptions_counts.npy. 49 | Containing a python list with the counts of descriptions per video. 50 | The output will be stored in ./Annotations. 51 | 52 | 53 | ## Image lists generation 54 | 55 | This step will be needed if we are using raw video frames only. 56 | 57 | Script name: 58 | generate_img_lists.py 59 | Description: 60 | Lists and counts the frames belonging to each video. 61 | Output: 62 | - A file per split with the suffix _imgs_list.txt. 63 | Containing the list of frames for all videos. 64 | - A file per split with the suffix _imgs_counts.txt. 65 | Containing a list of frame counts per video. 66 | The output will be stored in ./Annotations. 67 | 68 | 69 | ## Image features generation 70 | 71 | This step will be needed if we are using image features only. The number of feature vectors per video does not need to match the number of frames. 72 | 73 | Script name: 74 | generate_features_lists.py 75 | Description: 76 | Stores each feature vector contained in the corresponding ./Features/[split_name]_[name_feat].csv in a separate .npy file and counts them. 77 | Output: 78 | - A file per split with the suffix _feat_list.txt. 79 | Containing the path to each feature vector. 80 | - A file per split with the suffix _feat_counts.txt. 81 | Containing the counts of vectors per video. 82 | The output .txt files will be stored in ./Annotations/[name_feat]/ and the .npy files in ./Features/[name_feat]/. 83 | 84 | ## Temporally-linked samples 85 | 86 | This step will be needed if we are using temporally-linked samples.
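The link files produced by this step (described in detail below) are plain text files with one integer per sample: the index of the previous sample in the temporal link, or -1 if the sample is the first of its link. As a quick illustration, the following sketch summarizes the temporal chains encoded in such a file; the dataset path is a placeholder, the file name follows the default convention used in config.py, and the samples are assumed to be stored in temporal order:

```python
# Sketch: summarize the temporal chains encoded in a link file.
# base_path is a placeholder; adapt it to your dataset root.
base_path = '/path/to/EDUB-SegDesc/'
link_file = base_path + 'Annotations/train_link_samples.txt'

links = [int(line.strip()) for line in open(link_file)]

chain_lengths = []
for prev_idx in links:
    if prev_idx == -1 or not chain_lengths:
        # -1 marks the first sample of a new temporally-linked sequence
        chain_lengths.append(1)
    else:
        chain_lengths[-1] += 1

print('Samples:       ' + str(len(links)))
print('Linked chains: ' + str(len(chain_lengths)))
print('Longest chain: ' + str(max(chain_lengths)))
```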
87 | 88 | Script name: 89 | generate_link_lists.py 90 | Description: 91 | Stores a separate list .Annotations/[split_list]_link_samples.txt with the indices to the previous samples in the temporal link. 92 | Output: 93 | - A file per split with the suffix _link_samples.txt. 94 | Containing the index to the previous sample in the link (or -1) if it is the first sample in the link. 95 | -------------------------------------------------------------------------------- /data_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/data_engine/__init__.py -------------------------------------------------------------------------------- /data_engine/generate_corpus_full_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | the file id_seg_cap.txt has been generated with the folloing script 3 | 4 | awk '{print substr(FILENAME, 1, length(FILENAME)-4) "," $0}' * > ../id_seg_cap.txt 5 | 6 | and its format is: 7 | file_id, segment_number, caption 8 | """ 9 | 10 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/' 11 | 12 | txt_files = base_path + 'id_seg_cap.txt' 13 | dest_files = base_path + 'captions.id.full_history.txt' 14 | 15 | file = open(txt_files, mode='r') 16 | dest_file = open(dest_files + 'curr', mode='w') 17 | 18 | separator = '----' 19 | space_sym = ' ' 20 | 21 | prev_id = 'Segment1' 22 | caps_txt = [] 23 | prev_caps = [] 24 | j = 0 25 | for line in file: 26 | id_text = line.split(",") 27 | user_id = id_text[0] 28 | segment_id = id_text[1] 29 | text = ' '.join(id_text[2:]).strip() 30 | j += 1 31 | if j % 1000 == 0: 32 | print "Processed", j, "lines" 33 | if segment_id == prev_id: 34 | caps_txt.append(text) 35 | 36 | # for prev_cap in prev_caps: 37 | # caps_txt.append(prev_cap + space_sym + text) 38 | elif segment_id == 'Segment1': # Start of day 39 | prev_id = segment_id 40 | i = 0 41 | for curr_cap in caps_txt: 42 | dest_file.write(user_id + '_' + segment_id + '#' + str(i) + separator + curr_cap + '\n') 43 | i += 1 44 | prev_caps = caps_txt 45 | else: 46 | # Different segment 47 | # We combine 48 | prev_id = segment_id 49 | # for prev_cap in prev_caps: 50 | # prev_caps2.append(prev_cap + space_sym + cap) 51 | caps_txt = [] 52 | caps_txt.append(text) 53 | i = 0 54 | for prev_cap in prev_caps: 55 | for curr_cap in caps_txt: 56 | dest_file.write( 57 | user_id + '_' + segment_id + '#' + str(i) + separator + prev_cap + space_sym + curr_cap + '\n') 58 | i += 1 59 | prev_caps = [prev_cap + space_sym + curr_cap for curr_cap in caps_txt for prev_cap in prev_caps] 60 | -------------------------------------------------------------------------------- /data_engine/generate_descriptions_lists.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def main(): 5 | # base_path = '/media/HDD_2TB/DATASETS/MSVD/' 6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | 8 | without_noninfo = True 9 | 10 | path_files = 'Annotations' 11 | 12 | # Inputs 13 | # text = 'captions.id.en' 14 | if without_noninfo: 15 | text = 'captions_final_without_noninfo.id.en' 16 | else: 17 | text = 'captions_final.id.en' 18 | separator = '----' 19 | 20 | # train = 'train_list.txt' 21 | # val = 'val_list.txt' 22 | # test = 'test_list.txt' 23 | 24 | if without_noninfo: 25 | train = 'train_list_final_without_noninfo.txt' 26 | val = 'val_list_final_without_noninfo.txt' 27 | test = 
'test_list_final_without_noninfo.txt' 28 | 29 | # Outputs 30 | train_out = 'train_descriptions_without_noninfo.txt' 31 | val_out = 'val_descriptions_without_noninfo.txt' 32 | test_out = 'test_descriptions_without_noninfo.txt' 33 | 34 | train_out_counts = 'train_descriptions_counts_without_noninfo.npy' 35 | val_out_counts = 'val_descriptions_counts_without_noninfo.npy' 36 | test_out_counts = 'test_descriptions_counts_without_noninfo.npy' 37 | 38 | else: 39 | train = 'train_list_final.txt' 40 | val = 'val_list_final.txt' 41 | test = 'test_list_final.txt' 42 | 43 | # Outputs 44 | train_out = 'train_descriptions.txt' 45 | val_out = 'val_descriptions.txt' 46 | test_out = 'test_descriptions.txt' 47 | 48 | train_out_counts = 'train_descriptions_counts.npy' 49 | val_out_counts = 'val_descriptions_counts.npy' 50 | test_out_counts = 'test_descriptions_counts.npy' 51 | 52 | ################################# 53 | 54 | # Code 55 | 56 | text = path_files + '/' + text 57 | splits = [path_files + '/' + train, path_files + '/' + val, path_files + '/' + test] 58 | splits_out = [path_files + '/' + train_out, path_files + '/' + val_out, path_files + '/' + test_out] 59 | splits_counts = [path_files + '/' + train_out_counts, path_files + '/' + val_out_counts, 60 | path_files + '/' + test_out_counts] 61 | 62 | # read video names 63 | img_splits = [[], [], []] 64 | for i, s in enumerate(splits): 65 | with open(base_path + s, 'r') as f: 66 | for line in f: 67 | line = line.rstrip('\n') 68 | img_splits[i].append(line) 69 | 70 | # print img_splits 71 | 72 | 73 | # read descriptions and assign them to a split 74 | desc_splits = [] 75 | counts_splits = [] 76 | for i_s, s in enumerate(splits): 77 | desc_splits.append([[] for i in range(len(img_splits[i_s]))]) 78 | counts_splits.append([0 for i in range(len(img_splits[i_s]))]) 79 | with open(base_path + text, 'r') as f: 80 | for line in f: 81 | line = line.rstrip('\n') 82 | line = line.split('#') 83 | img = line[0] 84 | line = line[1].split(separator) 85 | desc = line[1] 86 | 87 | found = False 88 | i = 0 89 | while (not found and i < len(splits)): 90 | if (img in img_splits[i]): 91 | found = True 92 | idx = img_splits[i].index(img) 93 | desc_splits[i][idx].append(desc) 94 | counts_splits[i][idx] += 1 95 | i += 1 96 | 97 | if (not found): 98 | print 'Warning: Video ' + img + ' does not exist in lists' 99 | 100 | # write descriptions in separate files 101 | for f, d in zip(splits_out, desc_splits): 102 | f = open(base_path + f, 'w') 103 | for im in d: 104 | for desc in im: 105 | f.write(desc + '\n') 106 | f.close() 107 | 108 | # store description counts for each video 109 | for c, s in zip(counts_splits, splits_counts): 110 | np.save(base_path + s, c) 111 | 112 | print 'Done' 113 | 114 | 115 | main() 116 | -------------------------------------------------------------------------------- /data_engine/generate_features_lists.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import numpy as np 5 | 6 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 7 | path_features = 'Features' 8 | path_annotations = 'Annotations' 9 | without_noninfo = True 10 | 11 | # Inputs 12 | if without_noninfo: 13 | features_name = 'ImageNet_Without_NonInfo' 14 | else: 15 | features_name = 'ImageNet' 16 | 17 | ###### Files with fixed number of frames per video 18 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv'] 19 | # features_counts = ['train_' + 
features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt'] 20 | 21 | ###### Files all original frames of videos 22 | # features_files = ['train_' + features_name + '.csv', 23 | # 'val_' + features_name + '.csv', 24 | # 'test_' + features_name + '.csv'] 25 | # features_counts = ['train_' + features_name + '_counts.txt', 26 | # 'val_' + features_name + '_counts.txt', 27 | # 'test_' + features_name + '_all_frames_counts.txt'] 28 | 29 | 30 | if without_noninfo: 31 | features_files = ['train_' + features_name + '_all_frames_without_noninfo.csv', 32 | 'val_' + features_name + '_all_frames_without_noninfo.csv', 33 | 'test_' + features_name + '_all_frames_without_noninfo.csv'] 34 | features_counts = ['train_' + features_name + '_all_frames_counts_without_noninfo.txt', 35 | 'val_' + features_name + '_all_frames_counts_without_noninfo.txt', 36 | 'test_' + features_name + '_all_frames_counts_without_noninfo.txt'] 37 | else: 38 | features_files = ['train_' + features_name + '_all_frames.csv', 39 | 'val_' + features_name + '_all_frames.csv', 40 | 'test_' + features_name + '_all_frames.csv'] 41 | features_counts = ['train_' + features_name + '_all_frames_counts.txt', 42 | 'val_' + features_name + '_all_frames_counts.txt', 43 | 'test_' + features_name + '_all_frames_counts.txt'] 44 | 45 | # features_name = 'C3D_fc8_ImageNet' 46 | # features_files = ['train_' + features_name + '.csv', 'val_' + features_name + '.csv', 'test_' + features_name + '.csv'] 47 | # features_counts = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 'test_' + features_name + '_counts.txt'] 48 | 49 | # Outputs 50 | if without_noninfo: 51 | out_lists = ['train_feat_list_without_noninfo.txt', 52 | 'val_feat_list_without_noninfo.txt', 53 | 'test_feat_list_without_noninfo.txt'] 54 | counts_lists = ['train_feat_counts_without_noninfo.txt', 55 | 'val_feat_counts_without_noninfo.txt', 56 | 'test_feat_counts_without_noninfo.txt'] 57 | else: 58 | out_lists = ['train_feat_list.txt', 'val_feat_list.txt', 'test_feat_list.txt'] 59 | counts_lists = ['train_feat_counts.txt', 'val_feat_counts.txt', 'test_feat_counts.txt'] 60 | 61 | ######### 62 | 63 | if os.path.isdir(base_path + '/' + path_features + '/' + features_name): 64 | shutil.rmtree(base_path + '/' + path_features + '/' + features_name) 65 | os.makedirs(base_path + '/' + path_features + '/' + features_name) 66 | 67 | if not os.path.isdir(base_path + '/' + path_annotations + '/' + features_name): 68 | os.makedirs(base_path + '/' + path_annotations + '/' + features_name) 69 | 70 | c_videos = 0 71 | for f, fc, o, c in zip(features_files, features_counts, out_lists, counts_lists): 72 | print "Processing " + f 73 | 74 | f = open(base_path + '/' + path_features + '/' + f, 'r') 75 | fc = open(base_path + '/' + path_features + '/' + fc, 'r') 76 | o = open(base_path + '/' + path_annotations + '/' + features_name + '/' + o, 'w') 77 | c = open(base_path + '/' + path_annotations + '/' + features_name + '/' + c, 'w') 78 | 79 | all_counts = list() 80 | for line in fc: 81 | line = line.strip('\n') 82 | all_counts.append(int(line)) 83 | 84 | c_frame = 0 85 | c_videos_split = 0 86 | # Process each line in the file 87 | for enum, line in enumerate(f): 88 | frame = line.strip('\n') 89 | frame = np.fromstring(frame, sep=',') # covert csv line to numpy array 90 | 91 | this_path = "%s/video_%0.4d" % (path_features + '/' + features_name, c_videos) 92 | if not os.path.isdir(base_path + this_path): 93 | os.makedirs(base_path + 
this_path) 94 | this_path = "%s/video_%0.4d/frame_%0.4d.npy" % (path_features + '/' + features_name, c_videos, c_frame) 95 | # Save array in disk 96 | try: 97 | np.save(base_path + this_path, frame) 98 | except: 99 | print 'line file', enum 100 | print 'file name', base_path + this_path 101 | print 'lenvec', len(frame) 102 | print 'vec', frame 103 | print 104 | # Write path to file 105 | o.write(this_path + '\n') 106 | 107 | c_frame += 1 108 | 109 | # a complete video was processed 110 | if c_frame % all_counts[c_videos_split] == 0: 111 | c_videos += 1 112 | c.write(str(all_counts[c_videos_split]) + '\n') # store counts 113 | c_videos_split += 1 114 | c_frame = 0 115 | 116 | f.close() 117 | fc.close() 118 | o.close() 119 | c.close() 120 | 121 | print 'Done!' 122 | -------------------------------------------------------------------------------- /data_engine/generate_img_lists.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/' 4 | 5 | # Inputs 6 | split_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt'] 7 | imgs_format = '.jpg' 8 | path_imgs = 'Images' 9 | path_files = 'Annotations' 10 | 11 | # Outputs 12 | out_lists = ['train_imgs_list.txt', 'val_imgs_list.txt', 'test_imgs_list.txt'] 13 | counts_lists = ['train_imgs_counts.txt', 'val_imgs_counts.txt', 'test_imgs_counts.txt'] 14 | 15 | # Code 16 | print 'Listing all images from all videos...' 17 | 18 | len_base = len(base_path) 19 | for s, o, c in zip(split_lists, out_lists, counts_lists): 20 | s = open(base_path + '/' + path_files + '/' + s, 'r') 21 | o = open(base_path + '/' + path_files + '/' + o, 'w') 22 | c = open(base_path + '/' + path_files + '/' + c, 'w') 23 | for line in s: 24 | video = line.strip('\n') 25 | this_path = base_path + '/' + path_imgs + "/video_" + video + "/*" + imgs_format 26 | images = glob.glob(this_path) 27 | for im in images: 28 | # o.write(path_imgs+"/video_"+video+"/"+im+'\n') # store each image path 29 | o.write(im[len_base:] + '\n') 30 | c.write(str(len(images)) + '\n') # store counts 31 | s.close() 32 | o.close() 33 | c.close() 34 | 35 | print 'Done!' 
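# --- Optional sanity check (an added sketch, not part of the original script) ---
# Each split should end up with one path per image in its *_imgs_list.txt and one count
# per video in its *_imgs_counts.txt, so the counts should sum to the number of listed
# paths. Disabled by default so the behaviour of the script above is unchanged.
RUN_SANITY_CHECK = False
if RUN_SANITY_CHECK:
    for o_name, c_name in zip(out_lists, counts_lists):
        with open(base_path + '/' + path_files + '/' + o_name) as fo:
            n_paths = sum(1 for _ in fo)
        with open(base_path + '/' + path_files + '/' + c_name) as fc:
            n_expected = sum(int(l) for l in fc)
        print o_name, ':', n_paths, 'paths listed,', n_expected, 'expected from counts'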
36 | -------------------------------------------------------------------------------- /data_engine/generate_img_lists_from_split.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import xlrd 5 | 6 | # Split the existent data in train, val and test 7 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc' 8 | 9 | # input data paths 10 | in_descriptions_path = 'GT/descriptions' 11 | in_segments_path = 'GT/segmentations' 12 | in_images_path = 'Images' # //.jpg 13 | imgs_format = '.jpg' 14 | 15 | # output data paths 16 | out_features_path = 'Features' # __all_frames.csv & __all_frames_counts.txt 17 | out_descriptions_path = 'Annotations' 18 | out_image_lists_path = 'Annotations' # _imgs_list.txt & _imgs_counts.txt 19 | 20 | # Get day_sets for each data split 21 | sets = dict() 22 | for s in ['train', 'val', 'test']: 23 | sets[s] = [] 24 | with open(data_path + '/' + out_descriptions_path + '/' + s + '_list_final.txt', 'r') as list_file: 25 | prev_set = -1 26 | for line in list_file: 27 | line = line.rstrip('\n') 28 | line = line.split('_') 29 | if line[0] != prev_set: 30 | sets[s].append(line[0]) 31 | prev_set = line[0] 32 | 33 | # Get segments' IDs with errors 34 | errors = dict() 35 | for s in ['train', 'val', 'test']: 36 | errors[s] = dict() 37 | for day_split in sets[s]: 38 | errors[s][day_split] = [] 39 | with open(data_path + '/' + in_descriptions_path + '/' + day_split + '.txt', 'r') as list_file: 40 | for line in list_file: 41 | line = line.rstrip('\n').split(',') 42 | segm_id = int(line[0][7:]) 43 | desc = ','.join(line[1:]) 44 | desc = desc.strip().lower() 45 | if desc == 'error': 46 | errors[s][day_split].append(segm_id) 47 | 48 | # Get events of correct segments 49 | for s in ['train', 'val', 'test']: 50 | 51 | file_imgs = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_list.txt', 'w') 52 | file_counts = open(data_path + '/' + out_image_lists_path + '/' + s + '_imgs_counts.txt', 'w') 53 | 54 | for day_split in sets[s]: 55 | possible_names = ['/GT_' + day_split + '.xls', '/GT_' + day_split + '.xlsx', '/' + day_split + '.xls', 56 | '/' + day_split + '.xlsx'] 57 | exists = False 58 | i = 0 59 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]): 60 | i += 1 61 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i]) 62 | sheet = file.sheet_by_index(0) 63 | 64 | count_segments = 1 65 | these_events = [] 66 | empty = False 67 | i = 2 # 1st row with info 68 | while not empty: 69 | try: 70 | evt = sheet.cell(i, 1).value.split() 71 | if len(evt) == 1: 72 | evt = sheet.cell(i, 1).value.split('-') 73 | if evt: 74 | if count_segments not in errors[s][day_split]: # avoid segments with errors (dark/blurry images) 75 | these_events.append([evt[0].strip(), evt[1].strip()]) 76 | else: 77 | empty = True 78 | i += 1 79 | count_segments += 1 80 | except: 81 | empty = True 82 | 83 | # Get list of images 84 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + day_split + '/*' + imgs_format) 85 | final_these_images = [] 86 | for im in these_images: 87 | final_these_images.append(im.split('/')[-1].split('.')[0]) 88 | final_these_images = sorted(final_these_images) 89 | 90 | for e in these_events: 91 | if e[1] not in final_these_images: 92 | e[1] = '0' + e[1] 93 | if e[0] not in final_these_images: 94 | e[0] = '0' + e[0] 95 | 96 | fin_idx = final_these_images.index(e[1]) + 1 97 | ini_idx = final_these_images.index(e[0]) 98 | current_event_imgs = 
final_these_images[ini_idx:fin_idx] 99 | 100 | # Store in files 101 | this_count = 0 102 | for imid in current_event_imgs: 103 | file_imgs.write(in_images_path + '/' + day_split + '/' + imid + imgs_format + '\n') 104 | this_count += 1 105 | file_counts.write(str(this_count) + '\n') 106 | 107 | file_imgs.close() 108 | file_counts.close() 109 | 110 | print 'DONE!' 111 | -------------------------------------------------------------------------------- /data_engine/generate_link_lists.py: -------------------------------------------------------------------------------- 1 | ## Parameters 2 | 3 | base_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc/' 4 | 5 | path_files = 'Annotations' 6 | without_noninfo = True 7 | 8 | # Names of the different samples 9 | # All samples belonging to the same day must accomplish the following requirements: 10 | # - Be referenced continuously, without mixing with other days 11 | # - Be stored in chronological order 12 | # - Include the day identifier at the beginning of the line separated by the symbol '_' 13 | # Example: 14 | # Day1_video_1 15 | # Day1_video_2 16 | # Day1_video_3 17 | # Day2_video_1 18 | # Day2_video_2 19 | #### 20 | 21 | if without_noninfo: 22 | suffix = '_without_noninfo' 23 | else: 24 | suffix = '' 25 | 26 | train = 'train_list_final' + suffix + '.txt' 27 | val = 'val_list_final' + suffix + '.txt' 28 | test = 'test_list_final' + suffix + '.txt' 29 | 30 | # Outputs 31 | train_out = 'train_link_samples' + suffix + '.txt' 32 | val_out = 'val_link_samples' + suffix + '.txt' 33 | test_out = 'test_link_samples' + suffix + '.txt' 34 | 35 | ################################# 36 | 37 | ## Code 38 | 39 | # Generate temporal links between samples which belong to the same day 40 | for fin, fout in zip([train, val, test], [train_out, val_out, test_out]): 41 | 42 | with open(base_path + '/' + path_files + '/' + fin, 'r') as fi, open(base_path + '/' + path_files + '/' + fout, 43 | 'w') as fo: 44 | prev_day_name = '' 45 | lines_counter = -1 46 | for line in fi: 47 | day_name = line.split('_')[0] 48 | if day_name == prev_day_name: 49 | fo.write(str(lines_counter) + '\n') 50 | lines_counter += 1 51 | else: 52 | fo.write('-1\n') 53 | lines_counter += 1 54 | 55 | prev_day_name = day_name 56 | 57 | print 'Done' 58 | -------------------------------------------------------------------------------- /data_engine/generate_parallel_corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates a parallel corpus from the EDUB-GT Annotations: 3 | A language is the image captions. 4 | The other language is the previous caption of each sentence. 5 | """ 6 | 7 | base_path = '/media/HDD_2TB/DATASETS/EDUB-SegDesc/GT/' 8 | 9 | txt_files = base_path + 'text.clean.txt' 10 | dest_files = base_path + 'training.' 
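# (Added note) Judging from the parsing loop below, each line of 'text.clean.txt' is
# expected to look like "Segment<N>,<caption text>". The script writes two line-aligned
# files, 'training.prev' and 'training.curr': line i of 'training.prev' holds a caption
# of the previous segment (or 'None' at the start of a day) and line i of
# 'training.curr' holds the corresponding caption of the current segment.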
11 | 12 | file = open(txt_files, mode='r') 13 | 14 | file_prevs = open(dest_files + 'prev', mode='w') 15 | file_curr = open(dest_files + 'curr', mode='w') 16 | 17 | prev_id = 'Segment1' 18 | caps_txt = [] 19 | prev_caps = ['None'] 20 | i = 0 21 | for line in file: 22 | id_text = line.split(",") 23 | id = id_text[0] 24 | text = ' '.join(id_text[1:]).strip() 25 | if id == prev_id: 26 | caps_txt.append(text) 27 | elif id == 'Segment1': 28 | prev_id = id 29 | prev_caps = ['None'] 30 | caps_txt.append(text) 31 | for curr_cap in caps_txt: 32 | for prev_cap in prev_caps: 33 | file_prevs.write(prev_cap + '\n') 34 | file_curr.write(curr_cap + '\n') 35 | i += 1 36 | else: 37 | caps_txt.append(text) 38 | for curr_cap in caps_txt: 39 | for prev_cap in prev_caps: 40 | file_prevs.write(prev_cap + '\n') 41 | file_curr.write(curr_cap + '\n') 42 | i += 1 43 | 44 | prev_id = id 45 | prev_caps = caps_txt 46 | caps_txt = [] 47 | -------------------------------------------------------------------------------- /data_engine/prepare_data.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | 6 | from keras_wrapper.dataset import Dataset, saveDataset, loadDataset 7 | from keras_wrapper.extra.read_write import pkl2dict 8 | 9 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 10 | 11 | 12 | def build_dataset(params): 13 | if params['REBUILD_DATASET']: # We build a new dataset instance 14 | if params['VERBOSE'] > 0: 15 | silence = False 16 | logging.info('Building ' + params['DATASET_NAME'] + ' dataset') 17 | else: 18 | silence = True 19 | 20 | base_path = params['DATA_ROOT_PATH'] 21 | name = params['DATASET_NAME'] 22 | ds = Dataset(name, base_path, silence=silence) 23 | 24 | if not '-vidtext-embed' in params['DATASET_NAME']: 25 | # OUTPUT DATA 26 | # Let's load the train, val and test splits of the descriptions (outputs) 27 | # the files include a description per line. In this dataset a variable number 28 | # of descriptions per video are provided. 
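            # (Added note) each DESCRIPTION_FILES entry is expected to be a plain-text file
            # with one description per line, as produced by data_engine/generate_descriptions_lists.py.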
29 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 30 | 'train', 31 | type='text', 32 | id=params['OUTPUTS_IDS_DATASET'][0], 33 | build_vocabulary=True, 34 | tokenization=params['TOKENIZATION_METHOD'], 35 | fill=params['FILL'], 36 | pad_on_batch=True, 37 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 38 | sample_weights=params['SAMPLE_WEIGHTS'], 39 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 40 | 41 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['val'], 42 | 'val', 43 | type='text', 44 | id=params['OUTPUTS_IDS_DATASET'][0], 45 | build_vocabulary=True, 46 | pad_on_batch=True, 47 | tokenization=params['TOKENIZATION_METHOD'], 48 | sample_weights=params['SAMPLE_WEIGHTS'], 49 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 50 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 51 | 52 | ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['test'], 53 | 'test', 54 | type='text', 55 | id=params['OUTPUTS_IDS_DATASET'][0], 56 | build_vocabulary=True, 57 | pad_on_batch=True, 58 | tokenization=params['TOKENIZATION_METHOD'], 59 | sample_weights=params['SAMPLE_WEIGHTS'], 60 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 61 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 62 | 63 | else: 64 | # Use descriptions as inputs instead --> 'matching'/'non-matching' as output 65 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 66 | 'train', 67 | type='text', 68 | id=params['INPUTS_IDS_DATASET'][1], 69 | build_vocabulary=True, 70 | tokenization=params['TOKENIZATION_METHOD'], 71 | fill=params['FILL'], 72 | pad_on_batch=True, 73 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 74 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 75 | 76 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['val'], 77 | 'val', 78 | type='text', 79 | id=params['INPUTS_IDS_DATASET'][1], 80 | build_vocabulary=True, 81 | pad_on_batch=True, 82 | tokenization=params['TOKENIZATION_METHOD'], 83 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 84 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 85 | 86 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['test'], 87 | 'test', 88 | type='text', 89 | id=params['INPUTS_IDS_DATASET'][1], 90 | build_vocabulary=True, 91 | pad_on_batch=True, 92 | tokenization=params['TOKENIZATION_METHOD'], 93 | max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], 94 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 95 | 96 | # INPUT DATA 97 | # Let's load the associated videos (inputs) 98 | # we must take into account that in this dataset we have a different number of sentences per video, 99 | # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list 100 | # containing the number of captions in each video. 
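        # (Added illustrative note) e.g. if a split contains two videos with 3 and 5
        # captions respectively, the counts file holds [3, 5] and 'repeat_set' makes the
        # first video's features be served 3 times and the second one's 5 times, so that
        # every caption gets its own (video, caption) training pair.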
101 | 102 | num_captions_train = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train']) 103 | num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val']) 104 | num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test']) 105 | 106 | for feat_type in params['FEATURE_NAMES']: 107 | for split, num_cap in zip(['train', 'val', 'test'], 108 | [num_captions_train, num_captions_val, num_captions_test]): 109 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][split] % feat_type 110 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][split] % feat_type 111 | 112 | ds.setInput([list_files, counts_files], 113 | split, 114 | type=params['INPUT_DATA_TYPE'], 115 | id=params['INPUTS_IDS_DATASET'][0], 116 | repeat_set=num_cap, 117 | max_video_len=params['NUM_FRAMES'], 118 | feat_len=params['IMG_FEAT_SIZE'], 119 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 120 | 121 | if not '-vidtext-embed' in params['DATASET_NAME'] and len(params['INPUTS_IDS_DATASET']) > 1: 122 | ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 123 | 'train', 124 | type='text', 125 | id=params['INPUTS_IDS_DATASET'][1], 126 | required=False, 127 | tokenization=params['TOKENIZATION_METHOD'], 128 | pad_on_batch=True, 129 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 130 | offset=1, 131 | fill=params['FILL'], 132 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 133 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 134 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 135 | 136 | ds.setInput(None, 'val', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False) 137 | ds.setInput(None, 'test', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False) 138 | 139 | # Set inputs for temporally-linked samples 140 | if not '-vidtext-embed' in params['DATASET_NAME'] and '-linked' in params['DATASET_NAME']: 141 | # Set input captions from previous event/video 142 | if '-upperbound' not in params['DATASET_NAME']: 143 | if '-vidtext' in params['DATASET_NAME']: # use both previous video and previous description 144 | 145 | ds, repeat_images = insertTemporallyLinkedCaptionsVidText(ds, params, 146 | vidtext_set_names={ 147 | 'video': ['train', 'val', 'test'], 148 | 'text': ['train']}) 149 | del repeat_images['test'] 150 | del repeat_images['val'] 151 | # Insert empty prev_descriptions on val and test sets 152 | ds.setInput([], 153 | 'val', 154 | type='text', 155 | id=params['INPUTS_IDS_DATASET'][2], 156 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 157 | tokenization=params['TOKENIZATION_METHOD'], 158 | fill=params['FILL'], 159 | pad_on_batch=True, 160 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 161 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 162 | required=False, 163 | overwrite_split=True) 164 | ds.setInput([], 165 | 'test', 166 | type='text', 167 | id=params['INPUTS_IDS_DATASET'][2], 168 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 169 | tokenization=params['TOKENIZATION_METHOD'], 170 | fill=params['FILL'], 171 | pad_on_batch=True, 172 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 173 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 174 | required=False, 175 | overwrite_split=True) 176 | 177 | elif '-video' in params['DATASET_NAME']: 178 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, params, 179 | set_names=['train', 'val', 'test'], 180 | video=True) 181 | num_captions_val = repeat_images['val'] 182 | num_captions_test = repeat_images['test'] 183 | else: 184 | ds, repeat_images = 
insertTemporallyLinkedCaptions(ds, params) 185 | # Insert empty prev_descriptions on val and test sets 186 | ds.setInput([], 187 | 'val', 188 | type='text', 189 | id=params['INPUTS_IDS_DATASET'][2], 190 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 191 | tokenization=params['TOKENIZATION_METHOD'], 192 | fill=params['FILL'], 193 | pad_on_batch=True, 194 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 195 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 196 | required=False, 197 | overwrite_split=True) 198 | ds.setInput([], 199 | 'test', 200 | type='text', 201 | id=params['INPUTS_IDS_DATASET'][2], 202 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 203 | tokenization=params['TOKENIZATION_METHOD'], 204 | fill=params['FILL'], 205 | pad_on_batch=True, 206 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 207 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 208 | required=False, 209 | overwrite_split=True) 210 | else: 211 | ds, repeat_images = insertTemporallyLinkedCaptions(ds, 212 | params, 213 | set_names=['train', 'val', 'test'], 214 | upperbound=True, 215 | video='-video' in params['DATASET_NAME'], 216 | copy='-copy' in params['DATASET_NAME'], 217 | force_nocopy='-nocopy' in params['DATASET_NAME'], 218 | prev='-prev' in params['DATASET_NAME']) 219 | num_captions_val = repeat_images['val'] 220 | num_captions_test = repeat_images['test'] 221 | 222 | if not '-vidtext-embed' in params['DATASET_NAME']: 223 | # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format: 224 | # ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN] 225 | keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1, set_names=['val', 'test']) 226 | 227 | else: 228 | # Set outputs for -vidtext-embed model 229 | insertVidTextEmbedNegativeSamples(ds, params, 230 | repeat=[num_captions_train, num_captions_val, num_captions_test]) 231 | 232 | if not '-vidtext-embed' in params['DATASET_NAME'] and \ 233 | '-linked' in params['DATASET_NAME'] and \ 234 | '-upperbound' not in params['DATASET_NAME'] and \ 235 | '-video' not in params['DATASET_NAME']: 236 | # Set previous data indices 237 | for s, file in params['LINK_SAMPLE_FILES'].iteritems(): 238 | if s in repeat_images: 239 | rep = repeat_images[s] 240 | else: 241 | rep = 1 242 | ds.setInput(base_path + '/' + file, 243 | s, 244 | type='id', 245 | id=params['INPUTS_IDS_DATASET'][-1], 246 | repeat_set=rep) 247 | 248 | # We have finished loading the dataset, now we can store it for using it in the future 249 | saveDataset(ds, params['DATASET_STORE_PATH']) 250 | else: 251 | # We can easily recover it with a single line 252 | ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl') 253 | 254 | # Load vocabulary-related parameters of dataset used for pre-training 255 | if params['PRE_TRAINED_DATASET_NAME'] is not None: 256 | logging.info('Re-using previous dataset vocabulary ' + params['PRE_TRAINED_DATASET_NAME']) 257 | dataset_pretrained = loadDataset( 258 | params['DATASET_STORE_PATH'] + 'Dataset_' + params['PRE_TRAINED_DATASET_NAME'] + '.pkl') 259 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems(): 260 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained.vocabulary[id_old]) 261 | ds.vocabulary_len[id_new] = copy.deepcopy(dataset_pretrained.vocabulary_len[id_old]) 262 | elif params['PRE_TRAINED_VOCABULARY_NAME'] is not None: 263 | logging.info('Re-using previous vocabulary ' + params['PRE_TRAINED_VOCABULARY_NAME']) 264 | 
dataset_pretrained_vocabulary = pkl2dict( 265 | params['DATASET_STORE_PATH'] + params['PRE_TRAINED_VOCABULARY_NAME'] + '.pkl') 266 | for id_new, id_old in params['VOCABULARIES_MAPPING'].iteritems(): 267 | ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained_vocabulary[id_old]) 268 | ds.vocabulary_len[id_new] = len(dataset_pretrained_vocabulary[id_old]['idx2words']) 269 | 270 | return ds 271 | 272 | 273 | def keep_n_captions(ds, repeat, n=1, set_names=['val', 'test']): 274 | ''' Keeps only n captions per image and stores the rest in dictionaries for a later evaluation 275 | ''' 276 | 277 | for s, r in zip(set_names, repeat): 278 | logging.info('Keeping ' + str(n) + ' captions per input on the ' + str(s) + ' set.') 279 | 280 | ds.extra_variables[s] = dict() 281 | exec ('n_samples = ds.len_' + s) 282 | 283 | # Process inputs 284 | for id_in in ds.ids_inputs: 285 | new_X = [] 286 | if id_in in ds.optional_inputs: 287 | try: 288 | exec ('X = ds.X_' + s) 289 | i = 0 290 | for next_repeat in r: 291 | for j in range(n): 292 | new_X.append(X[id_in][i + j]) 293 | i += next_repeat 294 | exec ('ds.X_' + s + '[id_in] = new_X') 295 | except: 296 | pass 297 | else: 298 | exec ('X = ds.X_' + s) 299 | i = 0 300 | for next_repeat in r: 301 | for j in range(n): 302 | new_X.append(X[id_in][i + j]) 303 | i += next_repeat 304 | exec ('ds.X_' + s + '[id_in] = new_X') 305 | # Process outputs 306 | for id_out in ds.ids_outputs: 307 | new_Y = [] 308 | exec ('Y = ds.Y_' + s) 309 | dict_Y = dict() 310 | count_samples = 0 311 | i = 0 312 | for next_repeat in r: 313 | dict_Y[count_samples] = [] 314 | for j in range(next_repeat): 315 | if j < n: 316 | new_Y.append(Y[id_out][i + j]) 317 | dict_Y[count_samples].append(Y[id_out][i + j]) 318 | count_samples += 1 319 | i += next_repeat 320 | exec ('ds.Y_' + s + '[id_out] = new_Y') 321 | # store dictionary with vid_pos -> [cap1, cap2, cap3, ..., capNi] 322 | ds.extra_variables[s][id_out] = dict_Y 323 | 324 | new_len = len(new_Y) 325 | exec ('ds.len_' + s + ' = new_len') 326 | logging.info('Samples reduced to ' + str(new_len) + ' in ' + s + ' set.') 327 | 328 | 329 | def insertTemporallyLinkedCaptions(ds, params, set_names=['train'], 330 | upperbound=False, 331 | video=False, copy=False, force_nocopy=False, prev=False): 332 | """ 333 | Inserts an additional input consisting of the desired captions from the previous segment/event 334 | in chronological order. Example: 335 | : 336 | : 337 | . 338 | . 339 | . 340 | : 341 | : 342 | . 343 | . 344 | . 
345 | 346 | :param ds: dataset to modify 347 | :param params: parameters from config 348 | :param set_names: names of the splits that will be modified (default 'train' only) 349 | :param upperbound: whether we want to generate a dataset for an upper bound comparison by using the same captions both as input and output 350 | :param video: whether we use the previous' event video as input instead of the previous caption 351 | :param copy: generates an upperbound dataset only intending to copy giving only matching input-output sequences (only valid if upperbound=True) 352 | :param force_nocopy: generates an upperbound dataset using the same captions both as input and output but avoiding direct copies 353 | :param prev: indicates if we want to use the previous event's caption as input for the next, or use the current event's output instead 354 | 355 | :return: dataset modified with the additional input 356 | """ 357 | base_path = params['DATA_ROOT_PATH'] 358 | repeat_images = dict() 359 | 360 | for s in set_names: 361 | # retrieve number of output captions per sample 362 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s]) 363 | 364 | # get temporal links 365 | links = [] 366 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links: 367 | for line in f_links: 368 | links.append(int(line.strip())) 369 | 370 | outputs = [] 371 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs: 372 | for line in f_outs: 373 | outputs.append(line.strip()) 374 | 375 | # get outputs 376 | if video: 377 | prev_videos = [] 378 | for feat_type in params['FEATURE_NAMES']: 379 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 380 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 381 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts: 382 | prev_videos.append( 383 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]]) 384 | 385 | # modify outputs and prepare inputs 386 | images_repeat = [] 387 | upperbound_images_repeat = [] 388 | final_outputs = [] 389 | if video: 390 | final_inputs = dict() 391 | for feat_type in params['FEATURE_NAMES']: 392 | final_inputs[feat_type] = [[], []] 393 | else: 394 | final_inputs = [] 395 | for i, link in enumerate(links): 396 | ini_out = np.sum(num_cap[:i]) 397 | these_outputs = outputs[ini_out:ini_out + num_cap[i]] 398 | 399 | if upperbound: 400 | if copy: 401 | images_repeat.append(num_cap[i]) 402 | upperbound_images_repeat.append(num_cap[i]) 403 | for out in these_outputs: 404 | final_outputs.append(out) 405 | final_inputs.append(out) 406 | elif prev: 407 | # first sample in the temporally-linked sequence 408 | if link == -1: 409 | images_repeat.append(num_cap[i]) 410 | upperbound_images_repeat.append(num_cap[i]) 411 | for out in these_outputs: 412 | final_outputs.append(out) 413 | final_inputs.append('') 414 | else: 415 | prev_ini_out = np.sum(num_cap[:link]) 416 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 417 | images_repeat.append(num_cap[i] * num_cap[link]) 418 | for n in range(num_cap[link]): 419 | upperbound_images_repeat.append(num_cap[i]) 420 | for out in these_outputs: 421 | final_outputs.append(out) 422 | final_inputs.append(prev_outputs[n]) 423 | elif force_nocopy: 424 | raise NotImplementedError() 425 | prev_outputs = these_outputs 426 | images_repeat.append(num_cap[i] * (num_cap[i] - 1)) 427 | for n in range(num_cap[i]): 428 | 
upperbound_images_repeat.append(num_cap[i] - 1) 429 | for nthese, out in enumerate(these_outputs): 430 | if nthese != n: 431 | final_outputs.append(out) 432 | final_inputs.append(prev_outputs[n]) 433 | else: 434 | prev_outputs = these_outputs 435 | images_repeat.append(num_cap[i] * num_cap[i]) 436 | for n in range(num_cap[i]): 437 | upperbound_images_repeat.append(num_cap[i]) 438 | for out in these_outputs: 439 | final_outputs.append(out) 440 | final_inputs.append(prev_outputs[n]) 441 | else: 442 | if video: 443 | # first sample in the temporally-linked sequence 444 | if link == -1: 445 | images_repeat.append(num_cap[i]) 446 | for out in these_outputs: 447 | final_outputs.append(out) 448 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 449 | final_inputs[feat_type][1] += [0] 450 | else: 451 | images_repeat.append(num_cap[i]) 452 | for out in these_outputs: 453 | final_outputs.append(out) 454 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 455 | if link > 0: 456 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 457 | else: 458 | init_frame = 0 459 | this_count = prev_videos[ifeat][1][link] 460 | final_inputs[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 461 | final_inputs[feat_type][1] += [this_count] 462 | else: 463 | # first sample in the temporally-linked sequence 464 | if link == -1: 465 | images_repeat.append(num_cap[i]) 466 | for out in these_outputs: 467 | final_outputs.append(out) 468 | final_inputs.append('') 469 | else: 470 | prev_ini_out = np.sum(num_cap[:link]) 471 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 472 | images_repeat.append(num_cap[i] * num_cap[link]) 473 | for n in range(num_cap[link]): 474 | for out in these_outputs: 475 | final_outputs.append(out) 476 | final_inputs.append(prev_outputs[n]) 477 | 478 | # Overwrite input images assigning the new repeat pattern 479 | for feat_type in params['FEATURE_NAMES']: 480 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 481 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 482 | 483 | ds.setInput([list_files, counts_files], 484 | s, 485 | type=params['INPUT_DATA_TYPE'], 486 | id=params['INPUTS_IDS_DATASET'][0], 487 | repeat_set=images_repeat, 488 | max_video_len=params['NUM_FRAMES'], 489 | feat_len=params['IMG_FEAT_SIZE'], 490 | overwrite_split=True, 491 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 492 | 493 | if not video: 494 | # Overwrite outputs assigning the new outputs repeat pattern 495 | ds.setOutput(final_outputs, 496 | s, 497 | type='text', 498 | id=params['OUTPUTS_IDS_DATASET'][0], 499 | build_vocabulary=True, 500 | tokenization=params['TOKENIZATION_METHOD'], 501 | fill=params['FILL'], 502 | pad_on_batch=True, 503 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 504 | sample_weights=params['SAMPLE_WEIGHTS'], 505 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 506 | overwrite_split=True) 507 | 508 | # Overwrite the input state_below assigning the new outputs repeat pattern 509 | ds.setInput(final_outputs, 510 | s, 511 | type='text', 512 | id=params['INPUTS_IDS_DATASET'][1], 513 | required=False, 514 | tokenization=params['TOKENIZATION_METHOD'], 515 | pad_on_batch=True, 516 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 517 | offset=1, 518 | fill=params['FILL'], 519 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 520 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 521 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 522 | overwrite_split=True) 523 | 524 | if video: 525 | for 
feat_type in params['FEATURE_NAMES']: 526 | ds.setInput(final_inputs[feat_type], 527 | s, 528 | type=params['INPUT_DATA_TYPE'], 529 | id=params['INPUTS_IDS_DATASET'][2], 530 | repeat_set=images_repeat, 531 | max_video_len=params['NUM_FRAMES'], 532 | feat_len=params['IMG_FEAT_SIZE'], 533 | overwrite_split=True, 534 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 535 | else: 536 | # Set new input captions from previous temporally-linked event/video 537 | ds.setInput(final_inputs, 538 | s, 539 | type='text', 540 | id=params['INPUTS_IDS_DATASET'][2], 541 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 542 | tokenization=params['TOKENIZATION_METHOD'], 543 | fill=params['FILL'], 544 | pad_on_batch=True, 545 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 546 | min_occ=params['MIN_OCCURRENCES_VOCAB']) 547 | 548 | if upperbound: 549 | images_repeat = upperbound_images_repeat 550 | repeat_images[s] = images_repeat 551 | 552 | return ds, repeat_images 553 | 554 | 555 | def insertTemporallyLinkedCaptionsVidText(ds, params, vidtext_set_names={'video': ['train'], 'text': ['train']}): 556 | """ 557 | Inserts two additional input consisting of the videos and captions from the previous segment/event 558 | in chronological order. Example: 559 | : 560 | : 561 | . 562 | . 563 | . 564 | : 565 | : 566 | . 567 | . 568 | . 569 | 570 | :param ds: dataset to modify 571 | :param params: parameters from config 572 | :param vidtext_set_names: dictionary names of the splits that will be modified for 'video' and for 'text' 573 | 574 | :return: dataset modified with the additional input 575 | """ 576 | base_path = params['DATA_ROOT_PATH'] 577 | repeat_images = dict() 578 | 579 | set_names = set(vidtext_set_names['video'] + vidtext_set_names['text']) 580 | for s in set_names: 581 | # retrieve number of output captions per sample 582 | num_cap = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES'][s]) 583 | 584 | # get temporal links 585 | links = [] 586 | with open(base_path + '/' + params['LINK_SAMPLE_FILES'][s], 'r') as f_links: 587 | for line in f_links: 588 | links.append(int(line.strip())) 589 | 590 | outputs = [] 591 | with open(base_path + '/' + params['DESCRIPTION_FILES'][s], 'r') as f_outs: 592 | for line in f_outs: 593 | outputs.append(line.strip()) 594 | 595 | # get outputs 596 | if s in vidtext_set_names['video']: 597 | prev_videos = [] 598 | for feat_type in params['FEATURE_NAMES']: 599 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 600 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 601 | with open(list_files, 'r') as f_outs, open(counts_files, 'r') as f_outs_counts: 602 | prev_videos.append( 603 | [[line.strip() for line in f_outs], [int(line.strip()) for line in f_outs_counts]]) 604 | 605 | # modify outputs and prepare inputs 606 | images_repeat = [] 607 | final_outputs = [] 608 | if s in vidtext_set_names['video']: 609 | final_inputs_vid = dict() 610 | for feat_type in params['FEATURE_NAMES']: 611 | final_inputs_vid[feat_type] = [[], []] 612 | final_inputs_txt = [] 613 | 614 | for i, link in enumerate(links): 615 | ini_out = np.sum(num_cap[:i]) 616 | these_outputs = outputs[ini_out:ini_out + num_cap[i]] 617 | 618 | # first sample in the temporally-linked sequence 619 | if link == -1: 620 | images_repeat.append(num_cap[i]) 621 | for out in these_outputs: 622 | final_outputs.append(out) 623 | if s in vidtext_set_names['text']: 624 | final_inputs_txt.append('') 625 | if s in vidtext_set_names['video']: 626 | for ifeat, 
feat_type in enumerate(params['FEATURE_NAMES']): 627 | final_inputs_vid[feat_type][1] += [0] 628 | else: 629 | if s in vidtext_set_names['text']: 630 | prev_ini_out = np.sum(num_cap[:link]) 631 | prev_outputs = outputs[prev_ini_out:prev_ini_out + num_cap[link]] 632 | images_repeat.append(num_cap[i] * num_cap[link]) 633 | else: 634 | images_repeat.append(num_cap[i]) 635 | 636 | # video only 637 | if s not in vidtext_set_names['text'] and s in vidtext_set_names['video']: 638 | for out in these_outputs: 639 | final_outputs.append(out) 640 | 641 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 642 | if link > 0: 643 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 644 | else: 645 | init_frame = 0 646 | this_count = prev_videos[ifeat][1][link] 647 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 648 | final_inputs_vid[feat_type][1] += [this_count] 649 | 650 | # text only 651 | elif s in vidtext_set_names['text'] and s not in vidtext_set_names['video']: 652 | for n in range(num_cap[link]): 653 | for out in these_outputs: 654 | final_outputs.append(out) 655 | final_inputs_txt.append(prev_outputs[n]) 656 | 657 | # both 658 | else: 659 | for n in range(num_cap[link]): 660 | for out in these_outputs: 661 | final_outputs.append(out) 662 | final_inputs_txt.append(prev_outputs[n]) 663 | 664 | for ifeat, feat_type in enumerate(params['FEATURE_NAMES']): 665 | if link > 0: 666 | init_frame = int(sum(prev_videos[ifeat][1][:link])) 667 | else: 668 | init_frame = 0 669 | this_count = prev_videos[ifeat][1][link] 670 | final_inputs_vid[feat_type][0] += prev_videos[ifeat][0][init_frame:init_frame + this_count] 671 | final_inputs_vid[feat_type][1] += [this_count] 672 | 673 | # Overwrite input images assigning the new repeat pattern 674 | for feat_type in params['FEATURE_NAMES']: 675 | list_files = base_path + '/' + params['FRAMES_LIST_FILES'][s] % feat_type 676 | counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][s] % feat_type 677 | 678 | ds.setInput([list_files, counts_files], 679 | s, 680 | type=params['INPUT_DATA_TYPE'], 681 | id=params['INPUTS_IDS_DATASET'][0], 682 | repeat_set=images_repeat, 683 | max_video_len=params['NUM_FRAMES'], 684 | feat_len=params['IMG_FEAT_SIZE'], 685 | overwrite_split=True, 686 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 687 | 688 | # if text 689 | if s in vidtext_set_names['text']: 690 | # Overwrite outputs assigning the new outputs repeat pattern 691 | ds.setOutput(final_outputs, 692 | s, 693 | type='text', 694 | id=params['OUTPUTS_IDS_DATASET'][0], 695 | build_vocabulary=True, 696 | tokenization=params['TOKENIZATION_METHOD'], 697 | fill=params['FILL'], 698 | pad_on_batch=True, 699 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 700 | sample_weights=params['SAMPLE_WEIGHTS'], 701 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 702 | overwrite_split=True) 703 | 704 | # Overwrite the input state_below assigning the new outputs repeat pattern 705 | ds.setInput(final_outputs, 706 | s, 707 | type='text', 708 | id=params['INPUTS_IDS_DATASET'][1], 709 | required=False, 710 | tokenization=params['TOKENIZATION_METHOD'], 711 | pad_on_batch=True, 712 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 713 | offset=1, 714 | fill=params['FILL'], 715 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 716 | max_words=params['OUTPUT_VOCABULARY_SIZE'], 717 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 718 | overwrite_split=True) 719 | 720 | if s in vidtext_set_names['video']: 721 | for feat_type in 
params['FEATURE_NAMES']: 722 | ds.setInput(final_inputs_vid[feat_type], 723 | s, 724 | type=params['INPUT_DATA_TYPE'], 725 | id=params['INPUTS_IDS_DATASET'][3], 726 | repeat_set=images_repeat, 727 | max_video_len=params['NUM_FRAMES'], 728 | feat_len=params['IMG_FEAT_SIZE'], 729 | overwrite_split=True, 730 | data_augmentation_types=params['DATA_AUGMENTATION_TYPE']) 731 | 732 | if s in vidtext_set_names['text']: 733 | # Set new input captions from previous temporally-linked event/video 734 | ds.setInput(final_inputs_txt, 735 | s, 736 | type='text', 737 | id=params['INPUTS_IDS_DATASET'][2], 738 | required=False, 739 | build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], 740 | tokenization=params['TOKENIZATION_METHOD'], 741 | fill=params['FILL'], 742 | pad_on_batch=True, 743 | max_text_len=params['MAX_OUTPUT_TEXT_LEN'], 744 | min_occ=params['MIN_OCCURRENCES_VOCAB'], 745 | overwrite_split=True) 746 | 747 | repeat_images[s] = images_repeat 748 | 749 | return ds, repeat_images 750 | 751 | 752 | def insertVidTextEmbedNegativeSamples(ds, params, repeat): 753 | """ 754 | Inserts negative balanced examples for training a Video-Text Embedding model. 755 | 756 | :param ds: dataset object with inputs of positive samples inserted 757 | :param params: config params 758 | :param repeat: number of times each video was repeated 759 | """ 760 | 761 | for s, r in zip(['train', 'val', 'test'], repeat): 762 | 763 | # Get data from dataset 764 | X = None 765 | num_samples = 0 766 | exec ('num_samples = ds.len_' + s) 767 | exec ('X = ds.X_' + s) 768 | 769 | video_indices = X[params['INPUTS_IDS_DATASET'][0]] 770 | descriptions = X[params['INPUTS_IDS_DATASET'][1]] 771 | 772 | # Get real indices considering repetitions 773 | desc_real_indices = np.repeat(range(len(r)), r) 774 | 775 | # Let's generate some random video-description pairs 776 | negative_videos = np.random.choice(video_indices, num_samples, replace=True) 777 | for neg_id in negative_videos: 778 | # Insert index of repeated video (now as negative sample) 779 | video_indices.append(neg_id) 780 | 781 | # New find random description (avoiding correct descriptions for the selected video) 782 | real_id = desc_real_indices[neg_id] 783 | desc_id = np.random.choice([ind for ind in range(num_samples) if desc_real_indices[ind] != real_id], 1)[0] 784 | 785 | # Insert description of negative sample 786 | descriptions.append(descriptions[desc_id]) 787 | 788 | # Re-insert videos and descriptions, including new length 789 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][0] + '"] = video_indices') 790 | exec ('ds.X_' + s + '["' + params['INPUTS_IDS_DATASET'][1] + '"] = descriptions') 791 | exec ('ds.len_' + s + ' = num_samples*2') 792 | 793 | # Insert output, which consists in 'matching'/'non-matching labels' 794 | matches = [1 for i in range(num_samples)] + [0 for i in range(num_samples)] 795 | ds.setOutput(matches, 796 | s, 797 | type='categorical', 798 | id=params['OUTPUTS_IDS_DATASET'][0]) 799 | 800 | ds.setClasses(['matching', 'non-matching'], id=params['OUTPUTS_IDS_DATASET'][0]) 801 | -------------------------------------------------------------------------------- /data_engine/split_data.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import numpy as np 5 | import xlrd 6 | 7 | # Split the existent data in train, val and test 8 | data_path = '/media/HDD_3TB/DATASETS/EDUB-SegDesc' 9 | split_prop = {'train': 0.7, 10 | 'val': 0.15, 11 | 'test': 0.15, 12 | } 13 | sets_names = 
['Estefania1', 'Estefania2', 'Estefania3', 'Estefania4', 'Estefania5', 14 | 'Gabriel1', 'Gabriel2', 'Gabriel3', 'Gabriel4', 15 | 'MAngeles1', 'MAngeles2', 'MAngeles3', 'MAngeles4', 16 | 'Marc1', 'Marc2', 'Marc3', 'Marc4', 'Marc5', 'Marc6', 'Marc7', 'Marc8', 'Marc9', 17 | 'Marc10', 'Marc11', 'Marc12', 'Marc13', 'Marc14', 'Marc15', 'Marc16', 'Marc17', 'Marc18', 18 | 'MarcC1', 19 | 'Mariella', 'Mariella2', 'Mariella3', 20 | 'Maya1', 'Maya2', 'Maya3', 'Maya4', 'Maya5', 'Maya6', 'Maya7', 'Maya8', 21 | 'Maya9', 'Maya10', 'Maya11', 'Maya12', 'Maya13', 'Maya14', 22 | 'Pedro1', 'Pedro2', 'Pedro3', 'Pedro4', 23 | # 'Txell1' 24 | 'Petia1', 'Petia2', 25 | ] 26 | 27 | sets = {'train': ['Maya14', 'Maya11', 'Maya10', 'Maya13', 'Maya12', 'Petia2', 28 | 'MAngeles4', 'Mariella', 'MAngeles1', 'Pedro1', 'MAngeles3', 29 | 'Pedro3', 'MarcC1', 'Estefania1', 'Estefania3', 'Marc18', 'Maya5', 30 | 'Gabriel3', 'Maya6', 'Maya1', 'Maya3', 'Marc16', 'Marc17', 31 | 'Marc15', 'Maya9', 'Maya8', 'Marc10', 'Marc11', 'Gabriel2', 32 | 'Marc7', 'Maya4', 'MAngeles2', 'Gabriel1', 'Marc8', 'Marc12', 33 | 'Marc5', 'Mariella3', 'Marc2', 'Marc3'], 34 | 'val': ['Pedro4', 'Pedro2', 'Estefania4', 'Maya7', 'Marc6', 'Petia1', 'Mariella2'], 35 | 'test': ['Estefania2', 'Marc1', 'Estefania5', 'Marc9', 'Gabriel4', 'Maya2', 'Marc4', 'Marc14', 'Marc13'], 36 | } 37 | 38 | # input data paths 39 | in_features_path = 'Features/Features_original' # /.csv 40 | in_descriptions_path = 'GT/descriptions' # .txt 41 | in_segments_path = 'GT/segmentations' # GT_.xls(x) 42 | in_images_path = 'Images' # /.jpg 43 | in_features_name = 'GoogleNet_ImageNet' 44 | format = '.jpg' 45 | # list of non-informative images stored in /NonInfo/.csv 46 | # leave empty for not using it 47 | in_noninfo_path = 'Features/NonInfo' 48 | noninformative_prefix = 'infoCNN_outputClasses' 49 | 50 | # output data paths 51 | out_features_path = 'Features' # __all_frames.csv & __all_frames_counts.txt 52 | out_descriptions_path = 'Annotations' # captions.id.en & _list.txt 53 | out_features_name = 'ImageNet_Without_NonInfo' 54 | separator = '----' 55 | 56 | #################################### 57 | 58 | if noninformative_prefix: 59 | suffix_name = '_without_noninfo' 60 | else: 61 | suffix_name = '' 62 | 63 | # Only apply random selection if the sets split is not already provided 64 | if not sets: 65 | # generate data splits 66 | available_sets = len(sets_names) 67 | randomized = np.random.choice(sets_names, available_sets, replace=False) 68 | 69 | # randomized = np.array(sets_names) 70 | 71 | sets = dict() 72 | picked_so_far = 0 73 | for s, p in split_prop.iteritems(): 74 | last_picked = np.ceil(picked_so_far + available_sets * p) 75 | sets[s] = randomized[picked_so_far:last_picked] 76 | picked_so_far = last_picked 77 | 78 | # read images 79 | images = dict() 80 | for n, s in sets.iteritems(): 81 | for set in s: 82 | images[set] = [] 83 | these_images = glob.glob(data_path + '/' + in_images_path + '/' + set + '/*' + format) 84 | for im in these_images: 85 | images[set].append(im.split('/')[-1].split('.')[0]) 86 | images[set] = sorted(images[set]) 87 | 88 | # read segmentations 89 | events = dict() 90 | for n, s in sets.iteritems(): 91 | for set in s: 92 | possible_names = ['/GT_' + set + '.xls', '/GT_' + set + '.xlsx', '/' + set + '.xls', '/' + set + '.xlsx'] 93 | exists = False 94 | i = 0 95 | while not os.path.isfile(data_path + '/' + in_segments_path + possible_names[i]): 96 | i += 1 97 | file = xlrd.open_workbook(data_path + '/' + in_segments_path + possible_names[i]) 98 | sheet = 
file.sheet_by_index(0) 99 | 100 | these_events = [] 101 | empty = False 102 | i = 2 # 1st row with info 103 | while not empty: 104 | try: 105 | evt = sheet.cell(i, 1).value.split() 106 | if len(evt) == 1: 107 | evt = sheet.cell(i, 1).value.split('-') 108 | if evt: 109 | these_events.append([evt[0].strip(), evt[1].strip()]) 110 | else: 111 | empty = True 112 | i += 1 113 | except: 114 | empty = True 115 | events[set] = these_events 116 | 117 | # get frames counts from segments and images lists 118 | counts = dict() 119 | for n, s in sets.iteritems(): 120 | counts[n] = [] 121 | for set in s: 122 | counts[set] = [] 123 | prev = -1 124 | for e in events[set]: 125 | if e[1] not in images[set]: 126 | e[1] = '0' + e[1] 127 | if e[0] not in images[set]: 128 | e[0] = '0' + e[0] 129 | 130 | if prev != -1 and images[set].index(e[0]) - images[set].index(prev) > 1: 131 | raise Exception(images[set].index(e[0]), images[set].index(prev)) 132 | c = images[set].index(e[1]) - images[set].index(e[0]) + 1 133 | prev = e[1] 134 | 135 | counts[set].append(c) 136 | counts[n].append(c) 137 | 138 | assert np.sum(counts[set]) == len(images[set]) 139 | 140 | # get erroneous segments 141 | to_remove = dict() 142 | for n, s in sets.iteritems(): 143 | to_remove[n] = dict() 144 | for set in s: 145 | to_remove[n][set] = [] 146 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file: 147 | prev_segm = -1 148 | count = 0 149 | segm_count = 0 150 | segm_count_show = 0 151 | for cline, line in enumerate(desc_file): 152 | if line: 153 | line = line.rstrip('\n').split(',') 154 | segm = line[0] 155 | desc = ','.join(line[1:]) 156 | desc = desc.strip().lower() 157 | if desc == 'error': 158 | to_remove[n][set].append(segm_count) 159 | else: 160 | if prev_segm != segm: 161 | segm_count_show += 1 162 | count = 0 163 | count += 1 164 | assert segm[:7] == 'Segment', set + ', line ' + str(cline) 165 | if prev_segm != segm: 166 | if prev_segm == -1: 167 | assert int(segm[7:]) == 1 168 | else: 169 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str( 170 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1) 171 | segm_count += 1 172 | prev_segm = segm 173 | 174 | # get features for each data splits 175 | print 'Building features files...' 
176 | print '----------------------------------------' 177 | for n, s in sets.iteritems(): 178 | extra_removed = 0 179 | written_in_file = 0 180 | all_total = 0 181 | all_error = 0 182 | feats_file = open( 183 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames' + suffix_name + '.csv', 184 | 'w') 185 | counts_file = open( 186 | data_path + '/' + out_features_path + '/' + n + '_' + out_features_name + '_all_frames_counts' + suffix_name + '.txt', 187 | 'w') 188 | for set in s: 189 | these_removed = to_remove[n][set] 190 | these_counts = counts[set] 191 | feats_set = open(data_path + '/' + in_features_path + '/' + set + '/' + in_features_name + '.csv', 'r') 192 | if noninformative_prefix: 193 | noninfo_file = open(data_path + '/' + in_noninfo_path + '/' + noninformative_prefix + '_' + set + '.csv', 194 | 'r') 195 | for ic, count in enumerate(these_counts): 196 | all_total += 1 197 | new_count = 0 198 | these_feats = [] 199 | for c in range(count): 200 | line = feats_set.next().rstrip('\n') 201 | is_informative = True 202 | if noninformative_prefix: 203 | noninfo_line = noninfo_file.next().rstrip('\n') 204 | # checks if the current frame is non-informative and discards it 205 | if float(noninfo_line.split(',')[0]) >= 0.5: 206 | is_informative = False 207 | if is_informative: 208 | these_feats.append(line) 209 | new_count += 1 210 | if ic in these_removed: 211 | all_error += 1 212 | # Empty sequence due to non-informative removal. Let's introduce it into to_remove list 213 | if noninformative_prefix and len(these_feats) == 0: 214 | if ic not in these_removed: 215 | extra_removed += 1 216 | to_remove[n][set].append(ic) 217 | these_removed.append(ic) 218 | if ic not in these_removed: 219 | written_in_file += 1 220 | for feat in these_feats: 221 | feats_file.write(feat + '\n') 222 | counts_file.write(str(new_count) + '\n') 223 | 224 | if noninformative_prefix: 225 | noninfo_file.close() 226 | feats_set.close() 227 | feats_file.close() 228 | counts_file.close() 229 | 230 | print 'Extra removed', n, ':', extra_removed 231 | print 'Written in file', n, ':', written_in_file 232 | print '"ERROR" events', n, ':', all_error 233 | print 'Total original events', n, ':', all_total 234 | print 235 | 236 | # get descriptions for each segment 237 | print 'Building captions files...' 
238 | print '----------------------------------------' 239 | caption_general = open(data_path + '/' + out_descriptions_path + '/' + 'captions_final' + suffix_name + '.id.en', 'w') 240 | for n, s in sets.iteritems(): 241 | written_in_file = 0 242 | all_total = 0 243 | all_error = 0 244 | split_file = open(data_path + '/' + out_descriptions_path + '/' + n + '_list_final' + suffix_name + '.txt', 'w') 245 | for set in s: 246 | with open(data_path + '/' + in_descriptions_path + '/' + set + '.txt', 'r') as desc_file: 247 | prev_segm = -1 248 | count = 0 249 | segm_count = -1 250 | segm_count_show = 0 251 | for cline, line in enumerate(desc_file): 252 | if line: 253 | line = line.rstrip('\n').split(',') 254 | segm = line[0] 255 | desc = ','.join(line[1:]) 256 | desc = desc.strip().lower() 257 | if prev_segm != segm: 258 | all_total += 1 259 | if prev_segm == -1: 260 | assert int(segm[7:]) == 1 261 | else: 262 | assert int(segm[7:]) == int(prev_segm[7:]) + 1, set + ', line ' + str(cline) + ': ' + str( 263 | int(segm[7:])) + ' != ' + str(int(prev_segm[7:]) + 1) 264 | segm_count += 1 265 | if desc != 'error' and segm_count not in to_remove[n][set]: 266 | if prev_segm != segm: 267 | written_in_file += 1 268 | segm_count_show += 1 269 | split_file.write(set + '_Segment_' + str(segm_count_show) + '\n') 270 | count = 0 271 | caption_general.write(set + '_Segment_' + str(segm_count_show) 272 | + '#' + str(count) + separator + desc + '\n') 273 | count += 1 274 | else: 275 | if prev_segm != segm: 276 | all_error += 1 277 | assert segm[:7] == 'Segment', set + ', line ' + str(cline) 278 | 279 | prev_segm = segm 280 | try: 281 | int(segm[7:]) 282 | except: 283 | raise Exception(set + ' wrong Segment identifier: ' + segm) 284 | assert segm_count + 1 == int(segm[7:]), set + ': ' + str(segm_count + 1) + ' != ' + segm[7:] 285 | assert len(counts[set]) == segm_count + 1, set + ': ' + str(segm_count + 1) + ' != ' + str(len(counts[set])) 286 | 287 | split_file.close() 288 | 289 | print 'Written in file', n, ':', written_in_file 290 | print 'All removed events', n, ':', all_error 291 | print 'Total original events', n, ':', all_total 292 | print 293 | 294 | caption_general.close() 295 | 296 | print 'DONE!' 
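# --- Optional consistency check (an added sketch, not part of the original script) ---
# For every split, the features counts file written above should contain exactly as many
# lines as the corresponding <split>_list_final<suffix_name>.txt, since both skip the
# segments marked as 'error' or emptied by the non-informative filtering. Disabled by
# default so the behaviour of the script above is unchanged.
RUN_CONSISTENCY_CHECK = False
if RUN_CONSISTENCY_CHECK:
    for n in sets:
        counts_path = (data_path + '/' + out_features_path + '/' + n + '_' + out_features_name
                       + '_all_frames_counts' + suffix_name + '.txt')
        list_path = data_path + '/' + out_descriptions_path + '/' + n + '_list_final' + suffix_name + '.txt'
        with open(counts_path) as f_counts, open(list_path) as f_list:
            n_counts = sum(1 for _ in f_counts)
            n_events = sum(1 for _ in f_list)
        print n, ':', n_counts, 'feature count lines vs', n_events, 'listed events'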
297 | -------------------------------------------------------------------------------- /data_engine/subsample_frames_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | base_path = '/media/HDD_2TB/DATASETS/MSVD/' 4 | features_path = 'Features/Full_Features' 5 | output_path = 'Features' 6 | 7 | n_frames_per_video_subsample = 26 # subsample a fixed number of equidistant frames per video 8 | repeat_frames = False # decides whether we repeat some frames when needed to fill the desired 9 | # "n_frames_per_video_subsample", or simply fill the missing video frames with 0s 10 | 11 | 12 | # Inputs 13 | features_name = 'C3D_fc8_ImageNet' 14 | features_files = ['train_' + features_name + '_features.csv', 'val_' + features_name + '_features.csv', 15 | 'test_' + features_name + '_features.csv'] 16 | features_counts_files = ['train_' + features_name + '_counts.txt', 'val_' + features_name + '_counts.txt', 17 | 'test_' + features_name + '_counts.txt'] 18 | 19 | # Outputs 20 | out_features_name = 'C3D_fc8_ImageNet' 21 | out_features = ['train_' + out_features_name + '.csv', 'val_' + out_features_name + '.csv', 22 | 'test_' + out_features_name + '.csv'] 23 | out_features_counts = ['train_' + out_features_name + '_counts.txt', 'val_' + out_features_name + '_counts.txt', 24 | 'test_' + out_features_name + '_counts.txt'] 25 | 26 | ######### 27 | 28 | for ff_, fc_, of_, oc_ in zip(features_files, features_counts_files, out_features, out_features_counts): 29 | 30 | print 'Processing file', base_path + '/' + features_path + '/' + ff_ 31 | 32 | # Open files 33 | ff = open(base_path + '/' + features_path + '/' + ff_, 'r') 34 | fc = open(base_path + '/' + features_path + '/' + fc_, 'r') 35 | of = open(base_path + '/' + output_path + '/' + of_, 'w') 36 | oc = open(base_path + '/' + output_path + '/' + oc_, 'w') 37 | 38 | # Process each video 39 | for count_videos, count in enumerate(fc): 40 | # Calculate chosen frames 41 | count = int(count.strip('\n')) 42 | # pick_pos = np.round(np.linspace(0,count-1,n_frames_per_video_subsample)).astype('int64') 43 | pick_pos = np.linspace(0, count - 1, n_frames_per_video_subsample).astype('int64') 44 | if not repeat_frames: 45 | pick_pos = np.unique(pick_pos) 46 | count_pick = len(pick_pos) 47 | 48 | # Get all frames from current video 49 | feats = [[] for i in range(count)] 50 | for i in range(count): 51 | feats[i] = ff.next() 52 | 53 | # Get chosen frames 54 | for p in pick_pos: 55 | of.write(feats[p]) 56 | oc.write(str(count_pick) + '\n') 57 | if count_pick != n_frames_per_video_subsample: 58 | print "different", count_videos 59 | print "num", count_pick 60 | 61 | ff.close() 62 | fc.close() 63 | of.close() 64 | oc.close() 65 | 66 | print 'Output stored in', base_path + '/' + output_path + '/' + of_ 67 | -------------------------------------------------------------------------------- /docs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/docs/model.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import logging 3 | import sys 4 | from timeit import default_timer as timer 5 | 6 | from config import load_parameters 7 | from data_engine.prepare_data import build_dataset 8 | from keras_wrapper.cnn_model import loadModel, 
transferWeights, updateModel 9 | from keras_wrapper.extra.callbacks import EvalPerformance, Sample 10 | from keras_wrapper.extra.evaluation import selectMetric 11 | from keras_wrapper.extra.read_write import dict2pkl, list2file 12 | from keras_wrapper.utils import decode_predictions_beam_search, decode_predictions 13 | from viddesc_model import VideoDesc_Model 14 | 15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def train_model(params): 20 | """ 21 | Training function. Sets the training parameters from params. Build or loads the model and launches the training. 22 | :param params: Dictionary of network hyperparameters. 23 | :return: None 24 | """ 25 | 26 | if params['RELOAD'] > 0: 27 | logging.info('Resuming training.') 28 | 29 | check_params(params) 30 | 31 | ########### Load data 32 | dataset = build_dataset(params) 33 | if not '-vidtext-embed' in params['DATASET_NAME']: 34 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 35 | else: 36 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]] 37 | ########### 38 | 39 | 40 | ########### Build model 41 | 42 | if params['MODE'] == 'finetuning': 43 | # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) 44 | video_model = VideoDesc_Model(params, 45 | type=params['MODEL_TYPE'], 46 | verbose=params['VERBOSE'], 47 | model_name=params['MODEL_NAME'] + '_reloaded', 48 | vocabularies=dataset.vocabulary, 49 | store_path=params['STORE_PATH'], 50 | set_optimizer=False, 51 | clear_dirs=False) 52 | video_model = updateModel(video_model, params['RELOAD_PATH'], params['RELOAD'], reload_epoch=False) 53 | video_model.setParams(params) 54 | 55 | # Define the inputs and outputs mapping from our Dataset instance to our model 56 | inputMapping = dict() 57 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): 58 | if len(video_model.ids_inputs) > i: 59 | pos_source = dataset.ids_inputs.index(id_in) 60 | id_dest = video_model.ids_inputs[i] 61 | inputMapping[id_dest] = pos_source 62 | video_model.setInputsMapping(inputMapping) 63 | 64 | outputMapping = dict() 65 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): 66 | if len(video_model.ids_outputs) > i: 67 | pos_target = dataset.ids_outputs.index(id_out) 68 | id_dest = video_model.ids_outputs[i] 69 | outputMapping[id_dest] = pos_target 70 | video_model.setOutputsMapping(outputMapping) 71 | 72 | video_model.setOptimizer() 73 | params['MAX_EPOCH'] += params['RELOAD'] 74 | 75 | else: 76 | if params['RELOAD'] == 0 or params['LOAD_WEIGHTS_ONLY']: # build new model 77 | video_model = VideoDesc_Model(params, 78 | type=params['MODEL_TYPE'], 79 | verbose=params['VERBOSE'], 80 | model_name=params['MODEL_NAME'], 81 | vocabularies=dataset.vocabulary, 82 | store_path=params['STORE_PATH'], 83 | set_optimizer=True) 84 | dict2pkl(params, params['STORE_PATH'] + '/config') 85 | 86 | # Define the inputs and outputs mapping from our Dataset instance to our model 87 | inputMapping = dict() 88 | for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): 89 | if len(video_model.ids_inputs) > i: 90 | pos_source = dataset.ids_inputs.index(id_in) 91 | id_dest = video_model.ids_inputs[i] 92 | inputMapping[id_dest] = pos_source 93 | video_model.setInputsMapping(inputMapping) 94 | 95 | outputMapping = dict() 96 | for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): 97 | if 
len(video_model.ids_outputs) > i: 98 | pos_target = dataset.ids_outputs.index(id_out) 99 | id_dest = video_model.ids_outputs[i] 100 | outputMapping[id_dest] = pos_target 101 | video_model.setOutputsMapping(outputMapping) 102 | 103 | # Only load weights from pre-trained model 104 | if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0: 105 | for i in range(0, len(params['RELOAD'])): 106 | old_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'][i], params['RELOAD'][i]) 107 | video_model = transferWeights(old_model, video_model, params['LAYERS_MAPPING'][i]) 108 | video_model.setOptimizer() 109 | params['RELOAD'] = 0 110 | else: # resume from previously trained model 111 | video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) 112 | video_model.params['LR'] = params['LR'] 113 | video_model.setOptimizer() 114 | 115 | if video_model.model_path != params['STORE_PATH']: 116 | video_model.setName(params['MODEL_NAME'], models_path=params['STORE_PATH'], clear_dirs=False) 117 | # Update optimizer either if we are loading or building a model 118 | video_model.params = params 119 | video_model.setOptimizer() 120 | ########### 121 | 122 | 123 | ########### Test model saving/loading functions 124 | # saveModel(video_model, params['RELOAD']) 125 | # video_model = loadModel(params['STORE_PATH'], params['RELOAD']) 126 | ########### 127 | 128 | 129 | ########### Callbacks 130 | callbacks = buildCallbacks(params, video_model, dataset) 131 | ########### 132 | 133 | 134 | ########### Training 135 | total_start_time = timer() 136 | 137 | logger.debug('Starting training!') 138 | training_params = {'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 139 | 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 140 | 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 141 | 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 142 | 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 143 | 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 144 | 'data_augmentation': params['DATA_AUGMENTATION'], 145 | 'patience': params.get('PATIENCE', 0), # early stopping parameters 146 | 'metric_check': params.get('STOP_METRIC', None), 147 | 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 148 | 'each_n_epochs': params.get('EVAL_EACH', 1), 149 | 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0) 150 | } 151 | 152 | video_model.trainNet(dataset, training_params) 153 | 154 | total_end_time = timer() 155 | time_difference = total_end_time - total_start_time 156 | logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0)) 157 | 158 | 159 | def apply_Video_model(params): 160 | """ 161 | Function for using a previously trained model for sampling. 
162 | """ 163 | 164 | ########### Load data 165 | dataset = build_dataset(params) 166 | if not '-vidtext-embed' in params['DATASET_NAME']: 167 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 168 | else: 169 | params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][1]] 170 | ########### 171 | 172 | 173 | ########### Load model 174 | video_model = loadModel(params['STORE_PATH'], params['SAMPLING_RELOAD_POINT'], 175 | reload_epoch=params['SAMPLING_RELOAD_EPOCH']) 176 | video_model.setOptimizer() 177 | ########### 178 | 179 | 180 | ########### Apply sampling 181 | extra_vars = dict() 182 | extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) 183 | extra_vars['language'] = params.get('TRG_LAN', 'en') 184 | 185 | for s in params["EVAL_ON_SETS"]: 186 | 187 | # Apply model predictions 188 | params_prediction = {'max_batch_size': params['BATCH_SIZE'], 189 | 'n_parallel_loaders': params['PARALLEL_LOADERS'], 190 | 'predict_on_sets': [s]} 191 | 192 | # Convert predictions into sentences 193 | if not '-vidtext-embed' in params['DATASET_NAME']: 194 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 195 | else: 196 | vocab = None 197 | 198 | if params['BEAM_SEARCH']: 199 | params_prediction['beam_size'] = params['BEAM_SIZE'] 200 | params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] 201 | params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params[ 202 | 'DATASET_NAME'] 203 | params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] 204 | params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 205 | params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 206 | params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 207 | params_prediction['normalize_probs'] = params['NORMALIZE_SAMPLING'] 208 | params_prediction['alpha_factor'] = params['ALPHA_FACTOR'] 209 | params_prediction['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in \ 210 | params[ 211 | 'DATASET_NAME'] and '-video' not in \ 212 | params[ 213 | 'DATASET_NAME'] 214 | predictions = video_model.predictBeamSearchNet(dataset, params_prediction)[s] 215 | predictions = decode_predictions_beam_search(predictions, vocab, verbose=params['VERBOSE']) 216 | else: 217 | predictions = video_model.predictNet(dataset, params_prediction)[s] 218 | predictions = decode_predictions(predictions, 1, vocab, params['SAMPLING'], verbose=params['VERBOSE']) 219 | 220 | # Store result 221 | filepath = video_model.model_path + '/' + s + '_sampling.pred' # results file 222 | if params['SAMPLING_SAVE_MODE'] == 'list': 223 | list2file(filepath, predictions) 224 | else: 225 | raise Exception, 'Only "list" is allowed in "SAMPLING_SAVE_MODE"' 226 | 227 | # Evaluate if any metric in params['METRICS'] 228 | for metric in params['METRICS']: 229 | logging.info('Evaluating on metric ' + metric) 230 | filepath = video_model.model_path + '/' + s + '_sampling.' 
+ metric # results file 231 | 232 | # Evaluate on the chosen metric 233 | extra_vars[s] = dict() 234 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]] 235 | metrics = selectMetric[metric]( 236 | pred_list=predictions, 237 | verbose=1, 238 | extra_vars=extra_vars, 239 | split=s) 240 | 241 | # Print results to file 242 | with open(filepath, 'w') as f: 243 | header = '' 244 | line = '' 245 | for metric_ in sorted(metrics): 246 | value = metrics[metric_] 247 | header += metric_ + ',' 248 | line += str(value) + ',' 249 | f.write(header + '\n') 250 | f.write(line + '\n') 251 | logging.info('Done evaluating on metric ' + metric) 252 | 253 | 254 | def buildCallbacks(params, model, dataset): 255 | """ 256 | Builds the selected set of callbacks run during the training of the model. 257 | 258 | :param params: Dictionary of network hyperparameters. 259 | :param model: Model instance on which to apply the callback. 260 | :param dataset: Dataset instance on which to apply the callback. 261 | :return: 262 | """ 263 | 264 | callbacks = [] 265 | 266 | if params['METRICS']: 267 | # Evaluate training 268 | extra_vars = {'language': params.get('TRG_LAN', 'en'), 269 | 'n_parallel_loaders': params['PARALLEL_LOADERS'], 270 | 'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD'])} 271 | 272 | if not '-vidtext-embed' in params['DATASET_NAME']: 273 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 274 | for s in params['EVAL_ON_SETS']: 275 | extra_vars[s] = dict() 276 | extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]] 277 | else: 278 | vocab = None 279 | extra_vars['n_classes'] = len(dataset.dic_classes[params['OUTPUTS_IDS_DATASET'][0]].values()) 280 | for s in params['EVAL_ON_SETS']: 281 | extra_vars[s] = dict() 282 | extra_vars[s]['references'] = eval('dataset.Y_' + s + '["' + params['OUTPUTS_IDS_DATASET'][0] + '"]') 283 | 284 | if params['BEAM_SEARCH']: 285 | extra_vars['beam_size'] = params.get('BEAM_SIZE', 6) 286 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1) 287 | extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30) 288 | extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True) and '-upperbound' not in params[ 289 | 'DATASET_NAME'] 290 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] 291 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 292 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 293 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 294 | extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) 295 | extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.) 
296 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[ 297 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME'] 298 | input_text_id = None 299 | vocab_src = None 300 | 301 | callback_metric = EvalPerformance(model, 302 | dataset, 303 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 304 | metric_name=params['METRICS'], 305 | set_name=params['EVAL_ON_SETS'], 306 | batch_size=params['BATCH_SIZE'], 307 | each_n_epochs=params['EVAL_EACH'], 308 | extra_vars=extra_vars, 309 | reload_epoch=params['RELOAD'], 310 | is_text=True, 311 | input_text_id=input_text_id, 312 | index2word_y=vocab, 313 | index2word_x=vocab_src, 314 | sampling_type=params['SAMPLING'], 315 | beam_search=params['BEAM_SEARCH'], 316 | save_path=model.model_path, 317 | start_eval_on_epoch=params['START_EVAL_ON_EPOCH'], 318 | write_samples=True, 319 | write_type=params['SAMPLING_SAVE_MODE'], 320 | eval_on_epochs=params['EVAL_EACH_EPOCHS'], 321 | save_each_evaluation=params['SAVE_EACH_EVALUATION'], 322 | verbose=params['VERBOSE']) 323 | else: 324 | callback_metric = EvalPerformance(model, 325 | dataset, 326 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 327 | metric_name=params['METRICS'], 328 | set_name=params['EVAL_ON_SETS'], 329 | batch_size=params['BATCH_SIZE'], 330 | each_n_epochs=params['EVAL_EACH'], 331 | extra_vars=extra_vars, 332 | reload_epoch=params['RELOAD'], 333 | save_path=model.model_path, 334 | start_eval_on_epoch=params[ 335 | 'START_EVAL_ON_EPOCH'], 336 | write_samples=True, 337 | write_type=params['SAMPLING_SAVE_MODE'], 338 | eval_on_epochs=params['EVAL_EACH_EPOCHS'], 339 | save_each_evaluation=params[ 340 | 'SAVE_EACH_EVALUATION'], 341 | verbose=params['VERBOSE']) 342 | 343 | callbacks.append(callback_metric) 344 | 345 | if params['SAMPLE_ON_SETS']: 346 | # Write some samples 347 | extra_vars = {'language': params.get('TRG_LAN', 'en'), 'n_parallel_loaders': params['PARALLEL_LOADERS']} 348 | if not '-vidtext-embed' in params['DATASET_NAME']: 349 | vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 350 | else: 351 | vocab = None 352 | if params['BEAM_SEARCH']: 353 | extra_vars['beam_size'] = params['BEAM_SIZE'] 354 | extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1) 355 | extra_vars['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] 356 | extra_vars['optimized_search'] = params['OPTIMIZED_SEARCH'] and '-upperbound' not in params['DATASET_NAME'] 357 | extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] 358 | extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] 359 | extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] 360 | extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] 361 | extra_vars['normalize_probs'] = params['NORMALIZE_SAMPLING'] 362 | extra_vars['alpha_factor'] = params['ALPHA_FACTOR'] 363 | extra_vars['temporally_linked'] = '-linked' in params['DATASET_NAME'] and '-upperbound' not in params[ 364 | 'DATASET_NAME'] and '-video' not in params['DATASET_NAME'] 365 | 366 | callback_sampling = Sample(model, 367 | dataset, 368 | gt_id=params['OUTPUTS_IDS_DATASET'][0], 369 | set_name=params['SAMPLE_ON_SETS'], 370 | n_samples=params['N_SAMPLES'], 371 | each_n_updates=params['SAMPLE_EACH_UPDATES'], 372 | extra_vars=extra_vars, 373 | reload_epoch=params['RELOAD'], 374 | batch_size=params['BATCH_SIZE'], 375 | is_text=True, 376 | index2word_y=vocab, # text info 377 | in_pred_idx=params['INPUTS_IDS_DATASET'][0], 378 | sampling_type=params['SAMPLING'], # text info 379 | 
beam_search=params['BEAM_SEARCH'], 380 | start_sampling_on_epoch=params['START_SAMPLING_ON_EPOCH'], 381 | verbose=params['VERBOSE']) 382 | callbacks.append(callback_sampling) 383 | 384 | return callbacks 385 | 386 | 387 | def check_params(params): 388 | if 'Glove' in params['MODEL_TYPE'] and params['GLOVE_VECTORS'] is None: 389 | logger.warning("You set a model that uses pretrained word vectors but you didn't specify a vector file." 390 | "We'll train WITHOUT pretrained embeddings!") 391 | if params["USE_DROPOUT"] and params["USE_BATCH_NORMALIZATION"]: 392 | logger.warning("It's not recommended to use both dropout and batch normalization") 393 | 394 | 395 | if __name__ == "__main__": 396 | 397 | parameters = load_parameters() 398 | try: 399 | for arg in sys.argv[1:]: 400 | k, v = arg.split('=') 401 | parameters[k] = ast.literal_eval(v) 402 | except ValueError: 403 | print 'Overwritten arguments must have the form key=Value' 404 | exit(1) 405 | check_params(parameters) 406 | if parameters['MODE'] == 'training' or parameters['MODE'] == 'finetuning': 407 | logging.info('Running training.') 408 | train_model(parameters) 409 | elif parameters['MODE'] == 'sampling': 410 | logging.info('Running sampling.') 411 | apply_Video_model(parameters) 412 | 413 | logging.info('Done!') 414 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/README.md: -------------------------------------------------------------------------------- 1 | Package for performing hyperparameter optimization with [Spearmint] (https://github.com/HIPS/Spearmint). 2 | 3 | Requirements: Those specified in the [Spearmint] (https://github.com/HIPS/Spearmint) package: 4 | 5 | * [NumPy](http://www.numpy.org/) 6 | * [scikit learn](http://scikit-learn.org/stable/index.html) 7 | * [pymongo](https://api.mongodb.org/python/current) 8 | * [MongoDB](https://www.mongodb.org) 9 | 10 | Installation: 11 | 12 | * Install [Spearmint] (https://github.com/HIPS/Spearmint/blob/master/README.md) 13 | 14 | Usage: 15 | 16 | 1) Set your experimental settings (see `${nmt_keras_path}/spearmint/config.json` for an example) 17 | 18 | * **_WARNING!_**: It is highly recommendable to specify an absolute path to the data files in `config.py` when launching spearmint! 19 | 20 | 2) Run the `launch_spearmint.sh` script. It will execute the following steps: 21 | 22 | * Get NMT-Keras directory: 23 | 24 | ```bash 25 | cd nmt-keras 26 | nmt_keras_path=`pwd` 27 | ``` 28 | 29 | * Create directory for storing the database: 30 | 31 | ```bash 32 | mkdir ${nmt_keras_path}/spearmint/db 33 | ``` 34 | 35 | * Start the Mongo database: 36 | 37 | ```bash 38 | mongod --fork --logpath ${nmt_keras_path}/spearmint/db/log --dbpath ${nmt_keras_path}/spearmint/db 39 | ``` 40 | 41 | * Remove eventual instances of previous experiments 42 | 43 | ```bash 44 | ${spearmint_path}/spearmint/cleanup.sh ${nmt_keras_path}/spearmint/ 45 | ``` 46 | 47 | * Lauch Spearmint! 
Assuming that it is installed under `${spearmint_path}`: 48 | 49 | ```bash 50 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err & 51 | ``` 52 | 53 | * The results will appear at `${nmt_keras_path}/spearmint/output` 54 | 55 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/meta-optimizers/spearmint/__init__.py -------------------------------------------------------------------------------- /meta-optimizers/spearmint/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "PYTHON", 3 | "main-file": "spearmint_opt.py", 4 | "experiment-name": "TemporallyLinkedVideoDescriptionAtt", 5 | "likelihood": "GAUSSIAN", 6 | "variables": { 7 | "TARGET_TEXT_EMBEDDING_SIZE": { 8 | "type": "INT", 9 | "size": 1, 10 | "min": 50, 11 | "max": 600 12 | }, 13 | "ENCODER_HIDDEN_SIZE": { 14 | "type": "INT", 15 | "size": 1, 16 | "min": 100, 17 | "max": 500 18 | }, 19 | "LR_GAMMA": { 20 | "type": "FLOAT", 21 | "size": 1, 22 | "min": 0.95, 23 | "max": 1.0 24 | }, 25 | "N_LAYERS_ENCODER": { 26 | "type": "INT", 27 | "size": 1, 28 | "min": 1, 29 | "max": 2 30 | }, 31 | "N_LAYERS_PREV_SENT_ENCODER": { 32 | "type": "INT", 33 | "size": 1, 34 | "min": 1, 35 | "max": 2 36 | }, 37 | "DECODER_HIDDEN_SIZE": { 38 | "type": "INT", 39 | "size": 1, 40 | "min": 100, 41 | "max": 600 42 | }, 43 | "PREV_SENT_ENCODER_HIDDEN_SIZE": { 44 | "type": "INT", 45 | "size": 1, 46 | "min": 100, 47 | "max": 500 48 | } 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /meta-optimizers/spearmint/launch_spearmint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | spearmint_path=${SOFTWARE_PREFIX}/Spearmint 4 | nmt_keras_path=${SOFTWARE_PREFIX}/egocentric-video-description 5 | dest_dir=${nmt_keras_path}/meta-optimizers/spearmint 6 | mkdir -p ${dest_dir}/db 7 | mkdir -p ${dest_dir}/logs 8 | 9 | #Launch mongodb if it is not already launched 10 | if [ `ps -wuax |grep mongod |wc -l` -lt 2 ]; then 11 | mongod --fork --logpath ${dest_dir}/db/log --dbpath ${dest_dir}/db; 12 | fi 13 | 14 | 15 | ${spearmint_path}/spearmint/cleanup.sh ${dest_dir} 16 | 17 | cd ${nmt_keras_path}; nohup python ${spearmint_path}/spearmint/main.py ${dest_dir} --config=${nmt_keras_path}/meta-optimizers/spearmint/config.json >> ${dest_dir}/logs/out.log 2> ${dest_dir}/logs/out.err & 18 | echo "Main Spearmint process PID:" $! >> ${dest_dir}/logs/out.log -------------------------------------------------------------------------------- /meta-optimizers/spearmint/spearmint_opt.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | # sys.path.append("../../") # Adds higher directory to python modules path. 
7 | sys.path.insert(1, os.path.abspath(".")) 8 | sys.path.insert(0, os.path.abspath("../../")) 9 | 10 | print sys.path 11 | 12 | from config import load_parameters 13 | from main import check_params, train_model 14 | 15 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 16 | logger = logging.getLogger(__name__) 17 | metric_name = 'Bleu_4' 18 | maximize = True # Select whether we want to maximize the metric or minimize it 19 | d = dict(os.environ.copy()) 20 | d['LC_NUMERIC'] = 'en_US.utf-8' 21 | 22 | 23 | def invoke_model(parameters): 24 | model_params = load_parameters() 25 | model_name = model_params["MODEL_TYPE"] 26 | for parameter in parameters.keys(): 27 | model_params[parameter] = parameters[parameter][0] 28 | logger.debug("Assigning to %s the value %s" % (str(parameter), parameters[parameter][0])) 29 | model_name += '_' + str(parameter) + '_' + str(parameters[parameter][0]) 30 | model_params["SKIP_VECTORS_HIDDEN_SIZE"] = model_params["TARGET_TEXT_EMBEDDING_SIZE"] 31 | model_params["MODEL_NAME"] = model_name 32 | # models and evaluation results will be stored here 33 | model_params[ 34 | "STORE_PATH"] = '/home/lvapeab/smt/software/egocentric-video-description/meta-optimizers/spearmint/trained_models/' + \ 35 | model_params["MODEL_NAME"] + '/' 36 | check_params(model_params) 37 | assert model_params['MODE'] == 'training', 'You can only launch Spearmint when training!' 38 | logging.info('Running training.') 39 | train_model(model_params) 40 | 41 | results_path = model_params['STORE_PATH'] + '/' + model_params['EVAL_ON_SETS'][0] + '.' + model_params['METRICS'][0] 42 | 43 | # Recover the highest metric score 44 | metric_pos_cmd = "head -n 1 " + results_path + \ 45 | " |awk -v metric=" + metric_name + \ 46 | " 'BEGIN{FS=\",\"}" \ 47 | "{for (i=1; i<=NF; i++) if ($i == metric) print i;}'" 48 | metric_pos = \ 49 | subprocess.Popen(metric_pos_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True).communicate()[0][:-1] 50 | cmd = "tail -n +2 " + results_path + \ 51 | " |awk -v m_pos=" + str(metric_pos) + \ 52 | " 'BEGIN{FS=\",\"}{print $m_pos}'|sort -gr|head -n 1" 53 | ps = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, env=d) 54 | metric_value = float(ps.communicate()[0]) 55 | print "Best %s: %f" % (metric_name, metric_value) 56 | 57 | return 1. 
- metric_value if maximize else metric_value # Spearmint minimizes a function 58 | 59 | 60 | def main(job_id, params): 61 | print params 62 | return invoke_model(params) 63 | 64 | 65 | if __name__ == "__main__": 66 | # Testing function 67 | params = {'SOURCE_TEXT_EMBEDDING_SIZE': [1], 68 | 'ENCODER_HIDDEN_SIZE': [2], 69 | 'TARGET_TEXT_EMBEDDING_SIZE': [1], 70 | 'DECODER_HIDDEN_SIZE': [2], 71 | 'MAX_EPOCH': [2], 72 | 'START_EVAL_ON_EPOCH': [1]} 73 | main(1, params) 74 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | PYTHONPATH=$PYTHONPATH:/media/HDD_2TB/marc/multimodal_keras_wrapper python -u main.py 2 | -------------------------------------------------------------------------------- /turing_test.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import sys 4 | 5 | import numpy as np 6 | 7 | from config import load_parameters 8 | from data_engine.prepare_data import build_dataset 9 | from viddesc_model import VideoDesc_Model 10 | 11 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s', datefmt='%d/%m/%Y %H:%M:%S') 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def build(params): 16 | ds = build_dataset(params) 17 | params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] 18 | vocab = ds.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] 19 | 20 | # We only want the model for decoding 21 | video_model = VideoDesc_Model(params, 22 | type=params['MODEL_TYPE'], 23 | verbose=0, 24 | model_name=params['MODEL_NAME'], 25 | vocabularies=ds.vocabulary, 26 | store_path=params['STORE_PATH'], 27 | set_optimizer=False) 28 | 29 | return ds, vocab, video_model 30 | 31 | 32 | def sample(ds, vocab, video_model, n_samples, split='train', verbose=1): 33 | truth_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples) 34 | 35 | matches = 0 36 | misses = 0 37 | guesses = 0 38 | 39 | [truth_X, truth_Y] = ds.getXY_FromIndices('train', truth_data) 40 | 41 | truth_Xs = video_model.decode_predictions_beam_search(np.asarray(truth_X[-2]), vocab, verbose=0, pad_sequences=True) 42 | truth_Ys = video_model.decode_predictions_one_hot(np.asarray(truth_Y[0][0]), vocab) 43 | 44 | for i, (truth_X, truth_Y) in enumerate(zip(truth_Xs, truth_Ys)): 45 | try: 46 | fake_data = np.random.randint(0, high=eval('ds.len_' + split), size=n_samples) 47 | [fake_X, fake_Y] = ds.getXY_FromIndices('train', fake_data) 48 | fake_Xs = video_model.decode_predictions_beam_search(np.asarray(fake_X[-2]), vocab, verbose=0, 49 | pad_sequences=True) 50 | fake_Ys = video_model.decode_predictions_one_hot(np.asarray(fake_Y[0][0]), vocab) 51 | 52 | print "Input", i, ":", truth_X 53 | print "Which is the following event?" 54 | 55 | answer_list = [truth_Y] + fake_Ys 56 | correctness_list = [True] + [False] * len(fake_Ys) 57 | answer_correctness_list = list(zip(answer_list, correctness_list)) 58 | random.shuffle(answer_correctness_list) 59 | shuffled_answer_list, shuffled_correctness_list = zip(*answer_correctness_list) 60 | for j, answer in enumerate(shuffled_answer_list): 61 | print "\t", j, ":", answer 62 | action = int(raw_input('Select the upcoming event. \n')) 63 | if shuffled_correctness_list[action]: 64 | matches += 1 65 | if verbose: 66 | print "Correct!" 67 | else: 68 | misses += 1 69 | if verbose: 70 | print "Not correct!. 
The correct one was:", shuffled_answer_list[ 71 | shuffled_correctness_list.index(True)] 72 | guesses += 1 73 | print "" 74 | print "" 75 | except KeyboardInterrupt: 76 | return matches, misses, guesses 77 | 78 | return matches, misses, guesses 79 | 80 | 81 | if __name__ == "__main__": 82 | 83 | parameters = load_parameters() 84 | ########### 85 | ds, vocab, model = build(parameters) 86 | total_matches = 0 87 | total_misses = 0 88 | total_guesses = 0 89 | while True: 90 | try: 91 | matches, misses, guesses = sample(ds, vocab, model, 4, split='train', verbose=0) 92 | total_matches += matches 93 | total_misses += misses 94 | total_guesses += guesses 95 | except KeyboardInterrupt: 96 | print "Interrupted!" 97 | print "Total number of matches: %d/%d" % (total_matches, total_guesses) 98 | print "Total number of misses: %d/%d" % (total_misses, total_guesses) 99 | print "Precision: %f" % (float(total_matches) / total_guesses) 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcBS/TMA/76f4e16b3f10c056c45ada5df4a64cc564b69011/utils/__init__.py -------------------------------------------------------------------------------- /utils/common.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import json 4 | import os 5 | import re 6 | 7 | from toolz import itemmap 8 | 9 | from keras.optimizers import Adadelta 10 | from keras.optimizers import Adagrad 11 | from keras.optimizers import Adam 12 | from keras.optimizers import RMSprop 13 | from keras.optimizers import SGD 14 | 15 | PADDING = '' 16 | UNKNOWN = 'UNK' 17 | EOA = '' # end of answer 18 | EOQ = '' # end of question 19 | EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ] 20 | EXTRA_WORDS = {PADDING: 0, UNKNOWN: 1, EOA: 2, EOQ: 3} 21 | EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS) 22 | MAXLEN = 50 23 | 24 | OPTIMIZERS = { \ 25 | 'sgd': SGD, 26 | 'adagrad': Adagrad, 27 | 'adadelta': Adadelta, 28 | 'rmsprop': RMSprop, 29 | 'adam': Adam, 30 | } 31 | 32 | 33 | ### 34 | # Functions 35 | ### 36 | def static_vars(**kwargs): 37 | def decorate(func): 38 | for k in kwargs: 39 | setattr(func, k, kwargs[k]) 40 | return func 41 | 42 | return decorate 43 | 44 | 45 | @static_vars(counter=len(EXTRA_WORDS)) 46 | def _myinc(d): 47 | """ 48 | Gets a tuple d, and returns d[0]: id. 49 | """ 50 | x = d[0] 51 | _myinc.counter += 1 52 | return (x, _myinc.counter - 1) 53 | 54 | 55 | def create_dir_if_not_exists(directory): 56 | if not os.path.exists(directory): 57 | print 'creating directory %s' % directory 58 | os.makedirs(directory) 59 | else: 60 | print "%s already exists!" 
% directory 61 | 62 | 63 | def preprocess_line(line): 64 | cap_tmp = line.strip().decode('utf-8').lower().encode('utf8') 65 | return cap_tmp 66 | 67 | 68 | def preprocess_caption(cap): 69 | commaStrip = re.compile("(\d)(\,)(\d)") 70 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!'] 71 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)") 72 | 73 | def processPunctuation(inText): 74 | outText = inText 75 | for p in punct: 76 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None): 77 | outText = outText.replace(p, '') 78 | else: 79 | outText = outText.replace(p, ' ') 80 | outText = periodStrip.sub("", outText, re.UNICODE) 81 | return outText 82 | 83 | cap_tmp = cap.strip().decode('utf-8').lower().encode('utf8') 84 | cap_tmp = processPunctuation(cap_tmp) 85 | return cap_tmp 86 | 87 | 88 | def preprocess_question(q): 89 | contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", 90 | "couldn'tve": "couldn’t’ve", "couldnt’ve": "couldn’t’ve", "didnt": "didn’t", "doesnt": "doesn’t", 91 | "dont": "don’t", "hadnt": "hadn’t", "hadnt’ve": "hadn’t’ve", "hadn'tve": "hadn’t’ve", 92 | "hasnt": "hasn’t", "havent": "haven’t", "hed": "he’d", "hed’ve": "he’d’ve", "he’dve": "he’d’ve", 93 | "hes": "he’s", "howd": "how’d", "howll": "how’ll", "hows": "how’s", "Id’ve": "I’d’ve", 94 | "I’dve": "I’d’ve", "Im": "I’m", "Ive": "I’ve", "isnt": "isn’t", "itd": "it’d", "itd’ve": "it’d’ve", 95 | "it’dve": "it’d’ve", "itll": "it’ll", "let’s": "let’s", "maam": "ma’am", "mightnt": "mightn’t", 96 | "mightnt’ve": "mightn’t’ve", "mightn’tve": "mightn’t’ve", "mightve": "might’ve", 97 | "mustnt": "mustn’t", 98 | "mustve": "must’ve", "neednt": "needn’t", "notve": "not’ve", "oclock": "o’clock", 99 | "oughtnt": "oughtn’t", 100 | "ow’s’at": "’ow’s’at", "’ows’at": "’ow’s’at", "’ow’sat": "’ow’s’at", "shant": "shan’t", 101 | "shed’ve": "she’d’ve", "she’dve": "she’d’ve", "she’s": "she’s", "shouldve": "should’ve", 102 | "shouldnt": "shouldn’t", "shouldnt’ve": "shouldn’t’ve", "shouldn’tve": "shouldn’t’ve", 103 | "somebody’d": "somebodyd", "somebodyd’ve": "somebody’d’ve", "somebody’dve": "somebody’d’ve", 104 | "somebodyll": "somebody’ll", "somebodys": "somebody’s", "someoned": "someone’d", 105 | "someoned’ve": "someone’d’ve", "someone’dve": "someone’d’ve", "someonell": "someone’ll", 106 | "someones": "someone’s", "somethingd": "something’d", "somethingd’ve": "something’d’ve", 107 | "something’dve": "something’d’ve", "somethingll": "something’ll", "thats": "that’s", 108 | "thered": "there’d", "thered’ve": "there’d’ve", "there’dve": "there’d’ve", "therere": "there’re", 109 | "theres": "there’s", "theyd": "they’d", "theyd’ve": "they’d’ve", "they’dve": "they’d’ve", 110 | "theyll": "they’ll", "theyre": "they’re", "theyve": "they’ve", "twas": "’twas", "wasnt": "wasn’t", 111 | "wed’ve": "we’d’ve", "we’dve": "we’d’ve", "weve": "we've", "werent": "weren’t", "whatll": "what’ll", 112 | "whatre": "what’re", "whats": "what’s", "whatve": "what’ve", "whens": "when’s", "whered": 113 | "where’d", "wheres": "where's", "whereve": "where’ve", "whod": "who’d", "whod’ve": "who’d’ve", 114 | "who’dve": "who’d’ve", "wholl": "who’ll", "whos": "who’s", "whove": "who've", "whyll": "why’ll", 115 | "whyre": "why’re", "whys": "why’s", "wont": "won’t", "wouldve": "would’ve", "wouldnt": "wouldn’t", 116 | "wouldnt’ve": "wouldn’t’ve", "wouldn’tve": "wouldn’t’ve", "yall": "y’all", "yall’ll": "y’all’ll", 117 | "y’allll": 
"y’all’ll", "yall’d’ve": "y’all’d’ve", "y’alld’ve": "y’all’d’ve", 118 | "y’all’dve": "y’all’d’ve", 119 | "youd": "you’d", "youd’ve": "you’d’ve", "you’dve": "you’d’ve", "youll": "you’ll", 120 | "youre": "you’re", "youve": "you’ve"} 121 | manualMap = {'none': '0', 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 122 | 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10'} 123 | articles = ['a', 'an', 'the'] 124 | commaStrip = re.compile("(\d)(\,)(\d)") 125 | punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!'] 126 | periodStrip = re.compile("(?!<=\d)(\.)(?!\d)") 127 | 128 | def processPunctuation(inText): 129 | outText = inText 130 | for p in punct: 131 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) != None): 132 | outText = outText.replace(p, '') 133 | else: 134 | outText = outText.replace(p, ' ') 135 | outText = periodStrip.sub("", outText, re.UNICODE) 136 | return outText 137 | 138 | def processDigitArticle(inText): 139 | outText = [] 140 | tempText = inText.lower().split() 141 | for word in tempText: 142 | word = manualMap.setdefault(word, word) 143 | if word not in articles: 144 | outText.append(word) 145 | else: 146 | pass 147 | for wordId, word in enumerate(outText): 148 | if word in contractions: 149 | outText[wordId] = contractions[word] 150 | outText = ' '.join(outText) 151 | return outText 152 | 153 | q_tmp = q.strip().lower().encode('utf8') 154 | # q_tmp = processPunctuation(q_tmp) 155 | # q_tmp = processDigitArticle(q_tmp) 156 | if q_tmp[-1] == '?' and q_tmp[-2] != ' ': 157 | # separate word token from the question mark 158 | q_tmp = q_tmp[:-1] + ' ?' 159 | # remove question mark 160 | if q_tmp[-1] == '?': q_tmp = q_tmp[:-1] 161 | return q_tmp 162 | 163 | 164 | def save_txt_answers(samples, savefile='./sample', whichset='val', step=''): 165 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f: 166 | print >> f, '\n'.join(samples) 167 | 168 | 169 | def save_json_answers(samples, savefile='./sample', whichset='val', step=''): 170 | with open(savefile + '_' + whichset + '_samples_' + str(step) + '.json', 'w') as f: 171 | json.dump(samples, f) 172 | 173 | 174 | def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS, 175 | is_reset=True, truncate_to_most_frequent=0): 176 | """ 177 | Builds vocabulary from wordcount. 178 | It also adds extra words to the vocabulary. 179 | 180 | In: 181 | this_wordcount - dictionary of wordcounts, e.g. 
{'cpu':3} 182 | extra_words - additional words to build the vocabulary 183 | dictionary of {word: id} 184 | by default {UNKNOWN: 0} 185 | is_reset - if True we restart the vocabulary counting 186 | by defaults False 187 | truncate_to_most_frequent - if positive then the vocabulary 188 | is truncated to 'truncate_to_most_frequent' words; 189 | by default 0 190 | Out: 191 | word2index - mapping from words to indices 192 | index2word - mapping from indices to words 193 | """ 194 | if is_reset: 195 | _myinc.counter = len(EXTRA_WORDS) 196 | if truncate_to_most_frequent > 0: 197 | sorted_wordcount = dict(sorted( 198 | this_wordcount.items(), key=lambda x: x[1], reverse=True)[:truncate_to_most_frequent]) 199 | this_wordcount = sorted_wordcount 200 | word2index = itemmap(_myinc, this_wordcount) 201 | if not extra_words == {}: 202 | assert (all([el not in word2index.values() for el in extra_words.values()])) 203 | word2index.update(extra_words) 204 | index2word = itemmap(reversed, word2index) 205 | return word2index, index2word 206 | 207 | 208 | def index_sequence(x, word2index): 209 | """ 210 | Converts list of words into a list of its indices wrt. word2index, that is into 211 | index encoded sequence. 212 | 213 | In: 214 | x - list of lines 215 | word2index - mapping from words to indices 216 | 217 | Out: 218 | a list of the list of indices that encode the words 219 | """ 220 | one_hot_x = [] 221 | for line in x: 222 | line_list = [] 223 | for w in line.split(): 224 | w = w.strip() 225 | if w in word2index: 226 | this_ind = word2index[w] 227 | else: 228 | this_ind = word2index[UNKNOWN] 229 | line_list.append(this_ind) 230 | one_hot_x.append(line_list) 231 | return one_hot_x 232 | -------------------------------------------------------------------------------- /utils/evaluate_from_file.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scores a file of hypothesis. 3 | Usage: 4 | 1. Set the references in this file (questions and annotations). 5 | 2. 
python evaluate_vqa.py hypothesis.json 6 | """ 7 | 8 | import argparse 9 | 10 | from pycocoevalcap.bleu.bleu import Bleu 11 | from pycocoevalcap.cider.cider import Cider 12 | from pycocoevalcap.meteor.meteor import Meteor 13 | from pycocoevalcap.rouge.rouge import Rouge 14 | from pycocoevalcap.vqa import vqaEval, visual_qa 15 | 16 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/' 17 | ROOT_PATH = '/media/HDD_2TB/DATASETS/' 18 | 19 | questions = ROOT_PATH + '/VQA/Questions/OpenEnded_mscoco_val2014_questions.json' 20 | annotations = ROOT_PATH + '/VQA/Annotations/mscoco_val2014_annotations.json' 21 | 22 | parser = argparse.ArgumentParser( 23 | description="""This takes two files and a path the references (source, references), 24 | computes bleu, meteor, rouge and cider metrics""", formatter_class=argparse.RawTextHelpFormatter) 25 | parser.add_argument('-vqa', default=False, action="store_true", help='Compute VQA metrics') 26 | 27 | parser.add_argument('-q', type=str, default=questions, help='Path to questions file (only if the -vqa flag is active)') 28 | parser.add_argument('-a', type=str, default=annotations, 29 | help='Path to annotations file (only if the -vqa flag is active)') 30 | parser.add_argument('-hyp', type=str, help='Hypotheses file') 31 | 32 | parser.add_argument('-l', type=str, default='en', help='Meteor language') 33 | parser.add_argument('-r', type=argparse.FileType('r'), nargs="+", 34 | help='Path to all the reference files (single-reference files)') 35 | 36 | 37 | def score_vqa(resFile, quesFile, annFile): 38 | # create vqa object and vqaRes object 39 | vqa_ = visual_qa.VQA(annFile, quesFile) 40 | vqaRes = vqa_.loadRes(resFile, quesFile) 41 | vqaEval_ = vqaEval.VQAEval(vqa_, vqaRes, 42 | n=2) # n is precision of accuracy (number of places after decimal), default is 2 43 | vqaEval_.evaluate() 44 | print "Overall Accuracy is: %.02f\n" % (vqaEval_.accuracy['overall']) 45 | return vqaEval_.accuracy['overall'] 46 | 47 | 48 | def load_textfiles(references, hypothesis): 49 | print "The number of references is {}".format(len(references)) 50 | hypo = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesis)} 51 | # take out newlines before creating dictionary 52 | raw_refs = [map(str.strip, r) for r in zip(*references)] 53 | refs = {idx: rr for idx, rr in enumerate(raw_refs)} 54 | # sanity check that we have the same number of references as hypothesis 55 | if len(hypo) != len(refs): 56 | raise ValueError("There is a sentence number mismatch between the inputs: \n" 57 | "\t # sentences in references: %d\n" 58 | "\t # sentences in hypothesis: %d" % (len(refs), len(hypo))) 59 | return refs, hypo 60 | 61 | 62 | def CocoScore(ref, hypo, language='en'): 63 | """ 64 | ref, dictionary of reference sentences (id, sentence) 65 | hypo, dictionary of hypothesis sentences (id, sentence) 66 | score, dictionary of scores 67 | """ 68 | scorers = [ 69 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 70 | (Meteor(language), "METEOR"), 71 | (Rouge(), "ROUGE_L"), 72 | (Cider(), "CIDEr") 73 | ] 74 | final_scores = {} 75 | for scorer, method in scorers: 76 | score, scores = scorer.compute_score(ref, hypo) 77 | if type(score) == list: 78 | for m, s in zip(method, score): 79 | final_scores[m] = s 80 | else: 81 | final_scores[method] = score 82 | return final_scores 83 | 84 | 85 | if __name__ == "__main__": 86 | 87 | args = parser.parse_args() 88 | vqa_evaluation = args.vqa 89 | if vqa_evaluation: 90 | questions = args.q 91 | annotations = args.a 92 | hypotheses = args.hyp 93 | print 
"hypotheses file:", hypotheses 94 | score = score_vqa(hypotheses, questions, annotations) 95 | print "Score: ", score 96 | else: 97 | language = args.l 98 | hypotheses = open(args.hyp, 'r') 99 | ref, hypo = load_textfiles(args.r, hypotheses) 100 | score = CocoScore(ref, hypo, language=language) 101 | print "Score: ", score 102 | -------------------------------------------------------------------------------- /utils/plot_metric.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Read and plot several logs from cococaption 4 | 5 | if [ $# -lt 1 ]; 6 | then 7 | echo "Usage $0 [train.log] [val.log] [test.log]" 8 | fi 9 | 10 | metric_pos="3" 11 | metric_name="Bleu_4" 12 | out_name="./${metric_name}_plot" 13 | tail -n +2 $1 | awk 'BEGIN{FS=","}{print 1}'>/tmp/epochs; 14 | 15 | i=1 16 | for result in "$@"; do 17 | basename=$(basename $result) 18 | tail -n +2 $result | awk -v pos=${metric_pos} 'BEGIN{FS=","}{print $pos}'>/tmp/${basename}; 19 | names[$i]="${basename%.*}" 20 | i=$(( i + 1 )) 21 | basenames=${basenames}" /tmp/`basename $result`" 22 | done 23 | echo "Epoch ${names[*]}" > /tmp/scores 24 | 25 | paste -d " " /tmp/epochs $basenames >> /tmp/scores 26 | 27 | echo "set encoding iso_8859_1 28 | 29 | set style data lines 30 | set key font ',20' height 2 31 | set xtics font ',18' 32 | set ytics font ',18' 33 | set xlabel font ',20' '# Epoch' 34 | set ylabel font ',20' '${metric_name}'; 35 | 36 | set title '' 37 | set terminal pdf enhanced 38 | set termoption dash 39 | set output '${out_name}.pdf' 40 | set key left 41 | 42 | set yrange[0:1] 43 | set ytics (0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0) 44 | 45 | set bmargin 4 46 | plot for [col=2:$(( $# + 1 ))] '/tmp/scores' using 0:col with lines lt col lw 5 title columnheader " | gnuplot 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /utils/prepare_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from common import create_dir_if_not_exists 4 | 5 | ###### Parameters 6 | 7 | ROOT_PATH = '/media/HDD_2TB/DATASETS/' 8 | 9 | base_path = ROOT_PATH + '/Flickr8k/Features/' 10 | features = 'KCNN' # KCNN, Scenes, Objects 11 | base_path_save = base_path + features 12 | 13 | feats_paths = ['train_' + features + '_features.csv', 14 | 'val_' + features + '_features.csv', 15 | 'test_' + features + '_features.csv'] 16 | 17 | names_lists = ['train_list.txt', 'val_list.txt', 'test_list.txt'] 18 | folders_save = ['train', 'val', 'test'] 19 | 20 | apply_L2 = False 21 | n_feats = 1024 22 | 23 | ############ 24 | 25 | if apply_L2: 26 | file_save = features + '_L2' 27 | else: 28 | file_save = features 29 | 30 | 31 | def csv2npy(): 32 | # Process each data split separately 33 | for n, f, fs in zip(names_lists, feats_paths, folders_save): 34 | print "Preparing features %s" % f 35 | feats_dict = dict() 36 | # Get file names 37 | names = [] 38 | with open(base_path + '/' + n, 'r') as file: 39 | for line in file: 40 | line = line.rstrip('\n') 41 | line = line.split('.')[0] 42 | names.append(line) 43 | # Get features 44 | with open(base_path + '/' + f, 'r') as file: 45 | for i, line in enumerate(file): 46 | feats = np.fromstring(line.rstrip('\n'), sep=',') 47 | if (apply_L2): 48 | feats = feats / np.linalg.norm(feats, ord=2) 49 | # Insert in dictionary 50 | feats_dict[names[i]] = feats[:n_feats] 51 | 52 | # Store dict 53 | print "Saving features in %s" % 
(base_path_save + '/' + fs + '/' + file_save + '.npy') 54 | create_dir_if_not_exists(base_path_save + '/' + fs) 55 | np.save(base_path_save + '/' + fs + '/' + file_save + '.npy', feats_dict) 56 | print 57 | 58 | 59 | if __name__ == "__main__": 60 | csv2npy() 61 | -------------------------------------------------------------------------------- /utils/pretrain_word_vectors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Parameters 4 | # ROOT_PATH = '/home/lvapeab/smt/tasks/image_desc/VQA/' 5 | ROOT_PATH = '/media/HDD_2TB/DATASETS/VQA/' 6 | base_path = ROOT_PATH + 'Glove/' 7 | glove_path = base_path + 'glove.42B.300d.txt' 8 | dest_file = 'glove_300' 9 | 10 | 11 | def glove2npy(glove_path, base_path_save, dest_file): 12 | vecs_dict = dict() 13 | print "Loading vectors from %s" % (glove_path) 14 | 15 | glove_vectors = [x[:-1] for x in open(glove_path).readlines()] 16 | n_vecs = len(glove_vectors) 17 | print "Found %d vectors in %s" % (n_vecs, glove_path) 18 | i = 0 19 | for vector in glove_vectors: 20 | v = vector.split() 21 | word = v[0] 22 | vec = np.asarray(v[1:], dtype='float32') 23 | vecs_dict[word] = vec 24 | i += 1 25 | if i % 1000 == 0: 26 | print "Processed", i, "vectors (", 100 * float(i) / n_vecs, "%)\r", 27 | print 28 | # Store dict 29 | print "Saving word vectors in %s" % (base_path_save + '/' + dest_file + '.npy') 30 | # create_dir_if_not_exists(base_path_save) 31 | np.save(base_path_save + '/' + dest_file + '.npy', vecs_dict) 32 | print 33 | 34 | 35 | if __name__ == "__main__": 36 | glove2npy(glove_path, base_path, dest_file) 37 | -------------------------------------------------------------------------------- /utils/sort_by_split.py: -------------------------------------------------------------------------------- 1 | # Retrieves the images of a given split and sorts them according to that split 2 | import shutil 3 | 4 | from common import create_dir_if_not_exists 5 | 6 | image_dir = '/data/DATASETS/Flickr8k/Images' 7 | annotatios_dir = '/data/DATASETS/Flickr8k/Annotations' 8 | split_name = 'val' 9 | dest_dir = image_dir + '/' + split_name + '_images' 10 | ext = '.jpg' 11 | 12 | with open(annotatios_dir + '/' + split_name + '_list_ids.txt') as f: 13 | lines = f.readlines() 14 | 15 | create_dir_if_not_exists(dest_dir) 16 | n_items = len(str(len(lines))) + 1 17 | i = 0 18 | for filename in lines: 19 | i += 1 20 | shutil.copyfile(image_dir + '/' + filename[:-1] + ext, dest_dir + '/' + str(i).zfill(n_items) + ext) 21 | -------------------------------------------------------------------------------- /utils/split_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def iter_loadtxt(filename, delimiter=',', skiprows=0, dtype=np.float32): 5 | def iter_func(): 6 | with open(filename, 'r') as infile: 7 | for _ in range(skiprows): 8 | next(infile) 9 | for line in infile: 10 | line = line.rstrip().split(delimiter) 11 | for item in line: 12 | yield dtype(item) 13 | iter_loadtxt.rowlength = len(line) 14 | 15 | data = np.fromiter(iter_func(), dtype=dtype) 16 | data = data.reshape((-1, iter_loadtxt.rowlength)) 17 | return data 18 | 19 | 20 | base_path = '/media/HDD_2TB/DATASETS/MSVD/Features/' 21 | feature = 'ImageNetFV_Places_C3Dfc8' 22 | out_feature = 'ImageNetFV' 23 | 24 | for split in ['train', 'val', 'test']: 25 | print "Loading %s features" % str(split + '_' + feature) 26 | # feats = np.genfromtxt(open(base_path + split + '_' + feature + 
"_features.csv", "rb"), delimiter=",", dtype='float32') 27 | feats = iter_loadtxt(base_path + split + '_' + feature + "_features.csv") 28 | new_feats = feats[:, :1024] # Modify this instruction to get the desired features! 29 | print "Saving %s features" % str(split + '_' + feature) 30 | np.savetxt(base_path + split + '_' + out_feature + "_features.csv", new_feats, delimiter=",") 31 | -------------------------------------------------------------------------------- /utils/vocabulary_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ $# -lt 1 ] 5 | then 6 | echo "Usage: $0 text_file" 7 | echo "Computes the vocabulary size of text_file" 8 | exit 1 9 | fi 10 | 11 | 12 | for file in $* ;do 13 | vocab=`cat $file | tr " " '\n' | sort -u |wc -l` 14 | echo "$file: $vocab" 15 | done 16 | --------------------------------------------------------------------------------