├── LICENSE ├── README.md ├── __init__.py ├── average_precision_calculator.py ├── cloudml-gpu-distributed.yaml ├── cloudml-gpu.yaml ├── convert_prediction_from_json_to_csv.py ├── eval.py ├── eval_util.py ├── export_model.py ├── file_averaging.py ├── frame_level_models.py ├── inference.py ├── losses.py ├── mean_average_precision_calculator.py ├── model_utils.py ├── models.py ├── readers.py ├── train.py ├── utils.py └── video_level_models.py /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Youtube-8M-WILLOW 2 | 3 | NEW: I just released a pretrained model (Gated NetVLAD) as of 11th December 2017! 
4 | You can download the pretrained model here: https://www.rocq.inria.fr/cluster-willow/amiech/pretrainedmodel.zip
5 | The model is: gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe
6 |
7 | This is the code of the winners of the kaggle Youtube-8M Large-Scale Video Understanding challenge (https://www.kaggle.com/c/youtube8m).
8 | For more details about our models, please read our arXiv paper: https://arxiv.org/abs/1706.06905.
9 |
10 | This repo contains the code to reproduce a winning submission for the kaggle challenge. If you are just looking for an efficient Tensorflow implementation of NetVLAD, NetRVLAD, NetFV, Soft-DBoW and their gated versions, please directly consult our Tensorflow toolbox (LOUPE): https://github.com/antoine77340/LOUPE.
11 |
12 | The code is built on top of the Google Youtube-8M starter code (https://github.com/google/youtube-8m).
13 | Please look at their README for the dependencies needed to run the code (mainly Tensorflow 1.0).
14 |
15 | The only additional requirement is the pandas python library.
16 |
17 | Hardware requirement: each model was trained on a single NVIDIA TITAN X 12 GB GPU. Be aware that some of the models
18 | do not fit on a GPU with less than 9 GB of memory. Please do not modify the training batch size
19 | of these models, as it might affect the final results.
20 |
21 | Our best submitted model (GAP: 84.967% on the private leaderboard) is a weighted ensemble of 25 models.
22 | However, for the sake of simplicity, we present a much simpler ensemble of
23 | 7 models that is enough to reach first place with a significant margin (GAP ~ 84.7%). The 25 trained models
24 | are only very similar variants (differing in hyper-parameters) of these seven main models.
25 |
26 | Please note that, because of time constraints, we did not have a chance to re-run the code from scratch.
27 | It is possible, though rather unlikely, that something is not working properly. If so, please create an issue on
28 | github.
29 |
30 | ## Training the single models
31 |
32 | Each of the following command lines trains a single model; each is scheduled to stop training at the right time.
33 |
34 | Our models were trained on the full training set and almost all of the validation set.
35 | We only discarded 21k videos to build a smaller validation set.
36 | This validation set (used in the arXiv paper) is composed of all the tensorflow record files of the form 'validatea*.tfrecord'.
37 | We will, however, train the models on both the training and validation sets here, as was allowed in the kaggle competition; it should not make any significant difference.
38 |
39 | Each model takes several days to train, so the command lines are given separately so that they can be run in parallel if possible.
40 | Please replace 'path_to_features' with the path of the folder that contains all the tensorflow record frame-level features:
41 | ```sh
42 | path_to_features='path_to_features'
43 | ```
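Optionally, you can sanity-check that the downloaded files are the frame-level features the readers expect (and not the video-level ones). The snippet below is a minimal sketch, assuming TensorFlow 1.x and the standard YouTube-8M record layout; the file name is hypothetical, so point it at any tfrecord in your features folder:

```python
import tensorflow as tf

# Peek at the first record of one frame-level file (hypothetical file name).
record = next(tf.python_io.tf_record_iterator("path_to_features/trainab.tfrecord"))
example = tf.train.SequenceExample.FromString(record)
print(example.context.feature["video_id"].bytes_list.value[0])   # video id
print(len(example.feature_lists.feature_list["rgb"].feature))    # number of frames (rgb)
print(len(example.feature_lists.feature_list["audio"].feature))  # number of frames (audio)
```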
44 |
45 | Training Gated NetVLAD (256 Clusters):
46 |
47 | ```sh
48 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --max_step=700000
49 | ```
50 |
51 | Note: the best single model is this one, but with the flag --max_step=300000. We train it longer here because that works better in the ensemble.
52 |
53 | Training Gated NetFV (128 Clusters):
54 |
55 |
56 | ```sh
57 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetFVModelLF --train_dir=gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --fv_cluster_size=128 --fv_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --fv_relu=False --gating=True --moe_prob_gating=True --fv_couple_weights=False --max_step=600000
58 | ```
59 |
60 | Training Gated Soft-DBoW (4096 Clusters):
61 |
62 | ```sh
63 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=GatedDbofModelLF --train_dir=gateddboflf-4096-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --dbof_cluster_size=4096 --dbof_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --dbof_relu=False --max_step=1000000
64 | ```
65 |
66 | Training Soft-DBoW (8000 Clusters):
67 |
68 | ```sh
69 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=SoftDbofModelLF --train_dir=softdboflf-8000-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --dbof_cluster_size=8000 --dbof_hidden_size=1024 --iterations=300 --dbof_relu=False --max_step=800000
70 | ```
71 |
72 | Training Gated NetRVLAD (256 Clusters):
73 |
74 | ```sh
75 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetVLADModelLF --train_dir=gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --lightvlad=True --max_step=600000
76 | ```
77 |
78 | Training GRU (2 layers):
79 |
80 | ```sh
81 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=GruModel --train_dir=GRU-0002-1200 --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --base_learning_rate=0.0002 --gru_cells=1200 --learning_rate_decay=0.9 --moe_l2=1e-6 --max_step=300000
82 | ```
83 |
84 | Training LSTM (2 layers):
85 |
86 | ```sh
87 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=LstmModel --train_dir=lstm-0002-val-150-random --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --base_learning_rate=0.0002 --iterations=150 --lstm_random_sequence=True --max_step=400000
88 | ```
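While (or after) training, you can monitor the GAP of a model on the small held-out validation set with eval.py. The following is a sketch, not part of the original instructions, for the Gated NetVLAD model; it assumes the model-specific flags are set exactly as in that model's training command above:

```sh
python eval.py --eval_data_pattern="$path_to_features/validatea*.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --netvlad_relu=False --gating=True --moe_prob_gating=True --run_once=True
```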
89 |
90 |
91 | ## Inference
92 |
93 | After training, we will write the predictions into 7 different files and then ensemble them.
94 | Run each one of these commands to run inference for the corresponding model.
95 |
96 | ```sh
97 | python inference.py --output_file=test-lstm-0002-val-150-random.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=LstmModel --train_dir=lstm-0002-val-150-random --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --iterations=150 --lstm_random_sequence=True --run_once=True --top_k=50
98 |
99 | python inference.py --output_file=test-gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetVLADModelLF --train_dir=gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --lightvlad=True --run_once=True --top_k=50
100 |
101 | python inference.py --output_file=test-gateddboflf-4096-1024-80-0002-300iter-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=GatedDbofModelLF --train_dir=gateddboflf-4096-1024-80-0002-300iter-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=512 --base_learning_rate=0.0002 --dbof_cluster_size=4096 --dbof_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --dbof_relu=False --moe_prob_gating=True --run_once=True --top_k=50
102 |
103 | python inference.py --output_file=test-gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetFVModelLF --train_dir=gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --fv_cluster_size=128 --fv_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --fv_relu=False --gating=True --moe_prob_gating=True --fv_couple_weights=False --top_k=50
104 |
105 | python inference.py --output_file=test-GRU-0002-1200.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=GruModel --train_dir=GRU-0002-1200 --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --gru_cells=1200 --learning_rate_decay=0.9 --moe_l2=1e-6 --run_once=True --top_k=50
106 |
107 | python inference.py --output_file=test-gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --run_once=True --top_k=50
108 |
109 | python inference.py --output_file=test-softdboflf-8000-1024-80-0002-300iter.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=SoftDbofModelLF --train_dir=softdboflf-8000-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=256 --base_learning_rate=0.0002 --dbof_cluster_size=8000 --dbof_hidden_size=1024 --iterations=300 --dbof_relu=False --run_once=True --top_k=50
110 | ```
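Each of these runs writes one CSV per model. For reference, the layout (presumably the same one file_averaging.py consumes) is a VideoId,LabelConfidencePairs header followed by one row per test video with top_k space-separated 'label confidence' pairs; schematically, with made-up values:

```
VideoId,LabelConfidencePairs
AbCdEf1234,0 0.981 12 0.774 7 0.431 ...
```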
111 |
112 | ## Averaging the models
113 |
114 | After inference is done for all the models, just run:
115 |
116 |
117 | ```sh
118 | python file_averaging.py
119 | ```
120 |
121 | It will only take you the time to make a coffee, and the submission file will be written to WILLOW_submission.csv
122 | before you finish drinking it :D.
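file_averaging.py itself is not reproduced here, but the ensembling step amounts to a weighted per-video average of the seven CSVs. A self-contained sketch of such an averaging step, with illustrative equal weights (not the competition weights) and only two of the seven file names filled in:

```python
import pandas as pd
from collections import defaultdict

# Hypothetical weights; add the other five inference CSVs analogously.
files = {"test-lstm-0002-val-150-random.csv": 1.0,
         "test-GRU-0002-1200.csv": 1.0}

def read_preds(path):
    """Parse 'label score label score ...' pairs into a dict per video."""
    df = pd.read_csv(path)
    out = {}
    for vid, pairs in zip(df["VideoId"], df["LabelConfidencePairs"]):
        tokens = str(pairs).split()
        out[vid] = {int(l): float(s) for l, s in zip(tokens[::2], tokens[1::2])}
    return out

# Sum the weighted scores of every model, per video and per label.
blend = defaultdict(lambda: defaultdict(float))
for path, weight in files.items():
    for vid, scores in read_preds(path).items():
        for label, score in scores.items():
            blend[vid][label] += weight * score

# Write the 20 highest averaged scores per video in submission format.
total_weight = sum(files.values())
with open("WILLOW_submission.csv", "w") as f:
    f.write("VideoId,LabelConfidencePairs\n")
    for vid, scores in blend.items():
        top = sorted(scores.items(), key=lambda x: -x[1])[:20]
        f.write(vid + "," +
                " ".join("%d %f" % (l, s / total_weight) for l, s in top) + "\n")
```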
123 |
124 | Antoine Miech
125 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/average_precision_calculator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Calculate or keep track of the interpolated average precision.
16 |
17 | It provides an interface for calculating interpolated average precision for an
18 | entire list or the top-n ranked items. For the definition of the
19 | (non-)interpolated average precision:
20 | http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf
21 |
22 | Example usages:
23 | 1) Use it as a static function call to directly calculate average precision for
24 | a short ranked list in the memory.
25 |
26 | ```
27 | import random
28 | import numpy as np
29 |
30 | p = np.array([random.random() for _ in range(10)])
31 | a = np.array([random.choice([0, 1]) for _ in range(10)])
32 |
33 | ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
34 | ```
35 |
36 | 2) Use it as an object for long ranked lists that cannot be stored in memory, or
37 | for the case where partial predictions can be observed at a time (Tensorflow
38 | predictions). In this case, we first call the function accumulate many times
39 | to process parts of the ranked list. After processing all the parts, we call
40 | peek_ap_at_n.
41 |
42 | ```
43 | p1 = np.array([random.random() for _ in range(5)])
44 | a1 = np.array([random.choice([0, 1]) for _ in range(5)])
45 | p2 = np.array([random.random() for _ in range(5)])
46 | a2 = np.array([random.choice([0, 1]) for _ in range(5)])
47 |
48 | # non-interpolated average precision at 10
49 | calculator = average_precision_calculator.AveragePrecisionCalculator(10)
50 | calculator.accumulate(p1, a1)
51 | calculator.accumulate(p2, a2)
52 | ap3 = calculator.peek_ap_at_n()
53 | ```
54 | """
55 |
56 | import heapq
57 | import random
58 | import numbers
59 |
60 | import numpy
61 |
62 |
63 | class AveragePrecisionCalculator(object):
64 |   """Calculate the average precision and average precision at n."""
65 |
66 |   def __init__(self, top_n=None):
67 |     """Construct an AveragePrecisionCalculator to calculate average precision.
68 |
69 |     This class is used to calculate the average precision for a single label.
70 |
71 |     Args:
72 |       top_n: A positive integer specifying the average precision at n, or
73 |         None to use all provided data points.
74 |
75 |     Raises:
76 |       ValueError: An error occurred when the top_n is not a positive integer.
77 |     """
78 |     if not ((isinstance(top_n, int) and top_n > 0) or top_n is None):
79 |       raise ValueError("top_n must be a positive integer or None.")
80 |
81 |     self._top_n = top_n  # average precision at n
82 |     self._total_positives = 0  # total number of positives seen so far
83 |     self._heap = []  # min-heap holding the top-n (prediction, actual) pairs
84 |
85 |   @property
86 |   def heap_size(self):
87 |     """Gets the heap size maintained in the class."""
88 |     return len(self._heap)
89 |
90 |   @property
91 |   def num_accumulated_positives(self):
92 |     """Gets the number of positive samples that have been accumulated."""
93 |     return self._total_positives
94 |
95 |   def accumulate(self, predictions, actuals, num_positives=None):
96 |     """Accumulate the predictions and their ground truth labels.
97 |
98 |     After the function call, we may call peek_ap_at_n to actually calculate
99 |     the average precision.
100 |     Note predictions and actuals must have the same shape.
101 |
102 |     Args:
103 |       predictions: a list storing the prediction scores.
104 |       actuals: a list storing the ground truth labels. Any value
105 |         larger than 0 will be treated as positives, otherwise as negatives.
106 |       num_positives: If the 'predictions' and 'actuals' inputs aren't complete,
107 |         then it's possible some true positives were missed in them. In that
108 |         case, you can provide 'num_positives' in order to accurately track recall.
109 |
110 |     Raises:
111 |       ValueError: An error occurred when the format of the input is not the
112 |         numpy 1-D array or the shape of predictions and actuals does not match.
111 | """ 112 | if len(predictions) != len(actuals): 113 | raise ValueError("the shape of predictions and actuals does not match.") 114 | 115 | if not num_positives is None: 116 | if not isinstance(num_positives, numbers.Number) or num_positives < 0: 117 | raise ValueError("'num_positives' was provided but it wan't a nonzero number.") 118 | 119 | if not num_positives is None: 120 | self._total_positives += num_positives 121 | else: 122 | self._total_positives += numpy.size(numpy.where(actuals > 0)) 123 | topk = self._top_n 124 | heap = self._heap 125 | 126 | for i in range(numpy.size(predictions)): 127 | if topk is None or len(heap) < topk: 128 | heapq.heappush(heap, (predictions[i], actuals[i])) 129 | else: 130 | if predictions[i] > heap[0][0]: # heap[0] is the smallest 131 | heapq.heappop(heap) 132 | heapq.heappush(heap, (predictions[i], actuals[i])) 133 | 134 | def clear(self): 135 | """Clear the accumulated predictions.""" 136 | self._heap = [] 137 | self._total_positives = 0 138 | 139 | def peek_ap_at_n(self): 140 | """Peek the non-interpolated average precision at n. 141 | 142 | Returns: 143 | The non-interpolated average precision at n (default 0). 144 | If n is larger than the length of the ranked list, 145 | the average precision will be returned. 146 | """ 147 | if self.heap_size <= 0: 148 | return 0 149 | predlists = numpy.array(list(zip(*self._heap))) 150 | 151 | ap = self.ap_at_n(predlists[0], 152 | predlists[1], 153 | n=self._top_n, 154 | total_num_positives=self._total_positives) 155 | return ap 156 | 157 | @staticmethod 158 | def ap(predictions, actuals): 159 | """Calculate the non-interpolated average precision. 160 | 161 | Args: 162 | predictions: a numpy 1-D array storing the sparse prediction scores. 163 | actuals: a numpy 1-D array storing the ground truth labels. Any value 164 | larger than 0 will be treated as positives, otherwise as negatives. 165 | 166 | Returns: 167 | The non-interpolated average precision at n. 168 | If n is larger than the length of the ranked list, 169 | the average precision will be returned. 170 | 171 | Raises: 172 | ValueError: An error occurred when the format of the input is not the 173 | numpy 1-D array or the shape of predictions and actuals does not match. 174 | """ 175 | return AveragePrecisionCalculator.ap_at_n(predictions, 176 | actuals, 177 | n=None) 178 | 179 | @staticmethod 180 | def ap_at_n(predictions, actuals, n=20, total_num_positives=None): 181 | """Calculate the non-interpolated average precision. 182 | 183 | Args: 184 | predictions: a numpy 1-D array storing the sparse prediction scores. 185 | actuals: a numpy 1-D array storing the ground truth labels. Any value 186 | larger than 0 will be treated as positives, otherwise as negatives. 187 | n: the top n items to be considered in ap@n. 188 | total_num_positives : (optionally) you can specify the number of total 189 | positive 190 | in the list. If specified, it will be used in calculation. 191 | 192 | Returns: 193 | The non-interpolated average precision at n. 194 | If n is larger than the length of the ranked list, 195 | the average precision will be returned. 196 | 197 | Raises: 198 | ValueError: An error occurred when 199 | 1) the format of the input is not the numpy 1-D array; 200 | 2) the shape of predictions and actuals does not match; 201 | 3) the input n is not a positive integer. 
202 | """ 203 | if len(predictions) != len(actuals): 204 | raise ValueError("the shape of predictions and actuals does not match.") 205 | 206 | if n is not None: 207 | if not isinstance(n, int) or n <= 0: 208 | raise ValueError("n must be 'None' or a positive integer." 209 | " It was '%s'." % n) 210 | 211 | ap = 0.0 212 | 213 | predictions = numpy.array(predictions) 214 | actuals = numpy.array(actuals) 215 | 216 | # add a shuffler to avoid overestimating the ap 217 | predictions, actuals = AveragePrecisionCalculator._shuffle(predictions, 218 | actuals) 219 | sortidx = sorted( 220 | range(len(predictions)), 221 | key=lambda k: predictions[k], 222 | reverse=True) 223 | 224 | if total_num_positives is None: 225 | numpos = numpy.size(numpy.where(actuals > 0)) 226 | else: 227 | numpos = total_num_positives 228 | 229 | if numpos == 0: 230 | return 0 231 | 232 | if n is not None: 233 | numpos = min(numpos, n) 234 | delta_recall = 1.0 / numpos 235 | poscount = 0.0 236 | 237 | # calculate the ap 238 | r = len(sortidx) 239 | if n is not None: 240 | r = min(r, n) 241 | for i in range(r): 242 | if actuals[sortidx[i]] > 0: 243 | poscount += 1 244 | ap += poscount / (i + 1) * delta_recall 245 | return ap 246 | 247 | @staticmethod 248 | def _shuffle(predictions, actuals): 249 | random.seed(0) 250 | suffidx = random.sample(range(len(predictions)), len(predictions)) 251 | predictions = predictions[suffidx] 252 | actuals = actuals[suffidx] 253 | return predictions, actuals 254 | 255 | @staticmethod 256 | def _zero_one_normalize(predictions, epsilon=1e-7): 257 | """Normalize the predictions to the range between 0.0 and 1.0. 258 | 259 | For some predictions like SVM predictions, we need to normalize them before 260 | calculate the interpolated average precision. The normalization will not 261 | change the rank in the original list and thus won't change the average 262 | precision. 263 | 264 | Args: 265 | predictions: a numpy 1-D array storing the sparse prediction scores. 266 | epsilon: a small constant to avoid denominator being zero. 267 | 268 | Returns: 269 | The normalized prediction. 270 | """ 271 | denominator = numpy.max(predictions) - numpy.min(predictions) 272 | ret = (predictions - numpy.min(predictions)) / numpy.max(denominator, 273 | epsilon) 274 | return ret 275 | -------------------------------------------------------------------------------- /cloudml-gpu-distributed.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | runtimeVersion: "1.0" 3 | scaleTier: CUSTOM 4 | masterType: standard_gpu 5 | workerCount: 2 6 | workerType: standard_gpu 7 | parameterServerCount: 2 8 | parameterServerType: standard 9 | -------------------------------------------------------------------------------- /cloudml-gpu.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: CUSTOM 3 | # standard_gpu provides 1 GPU. Change to complex_model_m_gpu for 4 GPUs 4 | masterType: standard_gpu 5 | runtimeVersion: "1.0" 6 | -------------------------------------------------------------------------------- /convert_prediction_from_json_to_csv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Utility to convert the output of batch prediction into a CSV submission. 16 | 17 | It converts the JSON files created by the command 18 | 'gcloud beta ml jobs submit prediction' into a CSV file ready for submission. 19 | """ 20 | 21 | import json 22 | import tensorflow as tf 23 | 24 | from builtins import range 25 | from tensorflow import app 26 | from tensorflow import flags 27 | from tensorflow import gfile 28 | from tensorflow import logging 29 | 30 | 31 | FLAGS = flags.FLAGS 32 | 33 | if __name__ == '__main__': 34 | 35 | flags.DEFINE_string( 36 | "json_prediction_files_pattern", None, 37 | "Pattern specifying the list of JSON files that the command " 38 | "'gcloud beta ml jobs submit prediction' outputs. These files are " 39 | "located in the output path of the prediction command and are prefixed " 40 | "with 'prediction.results'.") 41 | flags.DEFINE_string( 42 | "csv_output_file", None, 43 | "The file to save the predictions converted to the CSV format.") 44 | 45 | 46 | def get_csv_header(): 47 | return "VideoId,LabelConfidencePairs\n" 48 | 49 | def to_csv_row(json_data): 50 | 51 | video_id = json_data["video_id"] 52 | 53 | class_indexes = json_data["class_indexes"] 54 | predictions = json_data["predictions"] 55 | 56 | if isinstance(video_id, list): 57 | video_id = video_id[0] 58 | class_indexes = class_indexes[0] 59 | predictions = predictions[0] 60 | 61 | if len(class_indexes) != len(predictions): 62 | raise ValueError( 63 | "The number of indexes (%s) and predictions (%s) must be equal." 64 | % (len(class_indexes), len(predictions))) 65 | 66 | return (video_id.decode('utf-8') + "," + " ".join("%i %f" % 67 | (class_indexes[i], predictions[i]) 68 | for i in range(len(class_indexes))) + "\n") 69 | 70 | def main(unused_argv): 71 | logging.set_verbosity(tf.logging.INFO) 72 | 73 | if not FLAGS.json_prediction_files_pattern: 74 | raise ValueError( 75 | "The flag --json_prediction_files_pattern must be specified.") 76 | 77 | if not FLAGS.csv_output_file: 78 | raise ValueError("The flag --csv_output_file must be specified.") 79 | 80 | logging.info("Looking for prediction files with pattern: %s", 81 | FLAGS.json_prediction_files_pattern) 82 | 83 | file_paths = gfile.Glob(FLAGS.json_prediction_files_pattern) 84 | logging.info("Found files: %s", file_paths) 85 | 86 | logging.info("Writing submission file to: %s", FLAGS.csv_output_file) 87 | with gfile.Open(FLAGS.csv_output_file, "w+") as output_file: 88 | output_file.write(get_csv_header()) 89 | 90 | for file_path in file_paths: 91 | logging.info("processing file: %s", file_path) 92 | 93 | with gfile.Open(file_path) as input_file: 94 | 95 | for line in input_file: 96 | json_data = json.loads(line) 97 | output_file.write(to_csv_row(json_data)) 98 | 99 | output_file.flush() 100 | logging.info("done") 101 | 102 | if __name__ == "__main__": 103 | app.run() 104 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Binary for evaluating Tensorflow models on the YouTube-8M dataset."""
15 |
16 | import time
17 |
18 | import eval_util
19 | import losses
20 | import frame_level_models
21 | import video_level_models
22 | import readers
23 | import tensorflow as tf
24 | from tensorflow import app
25 | from tensorflow import flags
26 | from tensorflow import gfile
27 | from tensorflow import logging
28 | import utils
29 |
30 | FLAGS = flags.FLAGS
31 |
32 | if __name__ == "__main__":
33 |   # Dataset flags.
34 |   flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
35 |                       "The directory to load the model files from. "
36 |                       "The tensorboard metrics files are also saved to this "
37 |                       "directory.")
38 |   flags.DEFINE_string(
39 |       "eval_data_pattern", "",
40 |       "File glob defining the evaluation dataset in tensorflow.SequenceExample "
41 |       "format. The SequenceExamples are expected to have an 'rgb' byte array "
42 |       "sequence feature as well as a 'labels' int64 context feature.")
43 |   flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
44 |                       "to use for training.")
45 |   flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")
46 |
47 |   # Model flags.
48 |   flags.DEFINE_bool(
49 |       "frame_features", False,
50 |       "If set, then --eval_data_pattern must be frame-level features. "
51 |       "Otherwise, --eval_data_pattern must be aggregated video-level "
52 |       "features. The model must also be set appropriately (i.e. to read 3D "
53 |       "batches VS 4D batches).")
54 |   flags.DEFINE_string(
55 |       "model", "LogisticModel",
56 |       "Which architecture to use for the model. Options include 'Logistic', "
57 |       "'SingleMixtureMoe', and 'TwoLayerSigmoid'. See video_level_models.py and "
58 |       "frame_level_models.py for the model definitions.")
59 |   flags.DEFINE_integer("batch_size", 1024,
60 |                        "How many examples to process per batch.")
61 |   flags.DEFINE_string("label_loss", "CrossEntropyLoss",
62 |                       "Loss computed on validation data")
63 |
64 |   # Other flags.
65 |   flags.DEFINE_integer("num_readers", 4,
66 |                        "How many threads to use for reading input files.")
67 |   flags.DEFINE_boolean("run_once", False, "Whether to run eval only once.")
68 |   flags.DEFINE_integer("top_k", 20, "How many predictions to output per video.")
69 |   flags.DEFINE_integer("check_point", -1,
70 |                        "Model checkpoint to load, -1 for latest.")
71 |
72 |
73 |
74 | def find_class_by_name(name, modules):
75 |   """Searches the provided modules for the named class and returns it."""
76 |   modules = [getattr(module, name, None) for module in modules]
77 |   return next(a for a in modules if a)
78 |
79 |
80 | def get_input_evaluation_tensors(reader,
81 |                                  data_pattern,
82 |                                  batch_size=1024,
83 |                                  num_readers=1):
84 |   """Creates the section of the graph which reads the evaluation data.
85 |
86 |   Args:
87 |     reader: A class which parses the training data.
88 |     data_pattern: A 'glob' style path to the data files.
89 |     batch_size: How many examples to process at a time.
90 | num_readers: How many I/O threads to use. 91 | 92 | Returns: 93 | A tuple containing the features tensor, labels tensor, and optionally a 94 | tensor containing the number of frames per video. The exact dimensions 95 | depend on the reader being used. 96 | 97 | Raises: 98 | IOError: If no files matching the given pattern were found. 99 | """ 100 | logging.info("Using batch size of " + str(batch_size) + " for evaluation.") 101 | with tf.name_scope("eval_input"): 102 | files = gfile.Glob(data_pattern) 103 | if not files: 104 | raise IOError("Unable to find the evaluation files.") 105 | logging.info("number of evaluation files: " + str(len(files))) 106 | filename_queue = tf.train.string_input_producer( 107 | files, shuffle=False, num_epochs=1) 108 | eval_data = [ 109 | reader.prepare_reader(filename_queue) for _ in range(num_readers) 110 | ] 111 | return tf.train.batch_join( 112 | eval_data, 113 | batch_size=batch_size, 114 | capacity=3 * batch_size, 115 | allow_smaller_final_batch=True, 116 | enqueue_many=True) 117 | 118 | 119 | def build_graph(reader, 120 | model, 121 | eval_data_pattern, 122 | label_loss_fn, 123 | batch_size=1024, 124 | num_readers=1): 125 | """Creates the Tensorflow graph for evaluation. 126 | 127 | Args: 128 | reader: The data file reader. It should inherit from BaseReader. 129 | model: The core model (e.g. logistic or neural net). It should inherit 130 | from BaseModel. 131 | eval_data_pattern: glob path to the evaluation data files. 132 | label_loss_fn: What kind of loss to apply to the model. It should inherit 133 | from BaseLoss. 134 | batch_size: How many examples to process at a time. 135 | num_readers: How many threads to use for I/O operations. 136 | """ 137 | 138 | global_step = tf.Variable(0, trainable=False, name="global_step") 139 | video_id_batch, model_input_raw, labels_batch, num_frames = get_input_evaluation_tensors( # pylint: disable=g-line-too-long 140 | reader, 141 | eval_data_pattern, 142 | batch_size=batch_size, 143 | num_readers=num_readers) 144 | tf.summary.histogram("model_input_raw", model_input_raw) 145 | 146 | feature_dim = len(model_input_raw.get_shape()) - 1 147 | 148 | # Normalize input features. 149 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 150 | 151 | with tf.name_scope("model"): 152 | result = model.create_model(model_input, 153 | num_frames=num_frames, 154 | vocab_size=reader.num_classes, 155 | labels=labels_batch, 156 | is_training=False) 157 | predictions = result["predictions"] 158 | tf.summary.histogram("model_activations", predictions) 159 | if "loss" in result.keys(): 160 | label_loss = result["loss"] 161 | else: 162 | label_loss = label_loss_fn.calculate_loss(predictions, labels_batch) 163 | 164 | tf.add_to_collection("global_step", global_step) 165 | tf.add_to_collection("loss", label_loss) 166 | tf.add_to_collection("predictions", predictions) 167 | tf.add_to_collection("input_batch", model_input) 168 | tf.add_to_collection("video_id_batch", video_id_batch) 169 | tf.add_to_collection("num_frames", num_frames) 170 | tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) 171 | tf.add_to_collection("summary_op", tf.summary.merge_all()) 172 | 173 | 174 | def evaluation_loop(video_id_batch, prediction_batch, label_batch, loss, 175 | summary_op, saver, summary_writer, evl_metrics, 176 | last_global_step_val): 177 | """Run the evaluation loop once. 178 | 179 | Args: 180 | video_id_batch: a tensor of video ids mini-batch. 181 | prediction_batch: a tensor of predictions mini-batch. 
182 | label_batch: a tensor of label_batch mini-batch. 183 | loss: a tensor of loss for the examples in the mini-batch. 184 | summary_op: a tensor which runs the tensorboard summary operations. 185 | saver: a tensorflow saver to restore the model. 186 | summary_writer: a tensorflow summary_writer 187 | evl_metrics: an EvaluationMetrics object. 188 | last_global_step_val: the global step used in the previous evaluation. 189 | 190 | Returns: 191 | The global_step used in the latest model. 192 | """ 193 | 194 | global_step_val = -1 195 | with tf.Session() as sess: 196 | if FLAGS.check_point < 0: 197 | latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir) 198 | else: 199 | latest_checkpoint = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) 200 | 201 | if latest_checkpoint: 202 | logging.info("Loading checkpoint for eval: " + latest_checkpoint) 203 | # Restores from checkpoint 204 | saver.restore(sess, latest_checkpoint) 205 | # Assuming model_checkpoint_path looks something like: 206 | # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it. 207 | global_step_val = latest_checkpoint.split("/")[-1].split("-")[-1] 208 | else: 209 | logging.info("No checkpoint file found.") 210 | return global_step_val 211 | 212 | if global_step_val == last_global_step_val: 213 | logging.info("skip this checkpoint global_step_val=%s " 214 | "(same as the previous one).", global_step_val) 215 | return global_step_val 216 | 217 | sess.run([tf.local_variables_initializer()]) 218 | 219 | # Start the queue runners. 220 | fetches = [video_id_batch, prediction_batch, label_batch, loss, summary_op] 221 | coord = tf.train.Coordinator() 222 | try: 223 | threads = [] 224 | for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): 225 | threads.extend(qr.create_threads( 226 | sess, coord=coord, daemon=True, 227 | start=True)) 228 | logging.info("enter eval_once loop global_step_val = %s. ", 229 | global_step_val) 230 | 231 | evl_metrics.clear() 232 | 233 | examples_processed = 0 234 | while not coord.should_stop(): 235 | batch_start_time = time.time() 236 | _, predictions_val, labels_val, loss_val, summary_val = sess.run( 237 | fetches) 238 | seconds_per_batch = time.time() - batch_start_time 239 | example_per_second = labels_val.shape[0] / seconds_per_batch 240 | examples_processed += labels_val.shape[0] 241 | 242 | iteration_info_dict = evl_metrics.accumulate(predictions_val, 243 | labels_val, loss_val) 244 | iteration_info_dict["examples_per_second"] = example_per_second 245 | 246 | iterinfo = utils.AddGlobalStepSummary( 247 | summary_writer, 248 | global_step_val, 249 | iteration_info_dict, 250 | summary_scope="Eval") 251 | logging.info("examples_processed: %d | %s", examples_processed, 252 | iterinfo) 253 | 254 | except tf.errors.OutOfRangeError as e: 255 | logging.info( 256 | "Done with batched inference. 
Now calculating global performance "
257 |           "metrics.")
258 |       # calculate the metrics for the entire epoch
259 |       epoch_info_dict = evl_metrics.get()
260 |       epoch_info_dict["epoch_id"] = global_step_val
261 |
262 |       summary_writer.add_summary(summary_val, global_step_val)
263 |       epochinfo = utils.AddEpochSummary(
264 |           summary_writer,
265 |           global_step_val,
266 |           epoch_info_dict,
267 |           summary_scope="Eval")
268 |       logging.info(epochinfo)
269 |       evl_metrics.clear()
270 |     except Exception as e:  # pylint: disable=broad-except
271 |       logging.info("Unexpected exception: " + str(e))
272 |       coord.request_stop(e)
273 |
274 |     coord.request_stop()
275 |     coord.join(threads, stop_grace_period_secs=10)
276 |
277 |     return global_step_val
278 |
279 |
280 | def evaluate():
281 |   tf.set_random_seed(0)  # for reproducibility
282 |   with tf.Graph().as_default():
283 |     # convert feature_names and feature_sizes to lists of values
284 |     feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
285 |         FLAGS.feature_names, FLAGS.feature_sizes)
286 |
287 |     if FLAGS.frame_features:
288 |       reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
289 |                                               feature_sizes=feature_sizes)
290 |     else:
291 |       reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
292 |                                                    feature_sizes=feature_sizes)
293 |
294 |     model = find_class_by_name(FLAGS.model,
295 |                                [frame_level_models, video_level_models])()
296 |     label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
297 |
298 |     if FLAGS.eval_data_pattern == "":  # '==' (value equality), not 'is' (identity)
299 |       raise IOError("'eval_data_pattern' was not specified. " +
300 |                     "Nothing to evaluate.")
301 |
302 |     build_graph(
303 |         reader=reader,
304 |         model=model,
305 |         eval_data_pattern=FLAGS.eval_data_pattern,
306 |         label_loss_fn=label_loss_fn,
307 |         num_readers=FLAGS.num_readers,
308 |         batch_size=FLAGS.batch_size)
309 |     logging.info("built evaluation graph")
310 |     video_id_batch = tf.get_collection("video_id_batch")[0]
311 |     prediction_batch = tf.get_collection("predictions")[0]
312 |     label_batch = tf.get_collection("labels")[0]
313 |     loss = tf.get_collection("loss")[0]
314 |     summary_op = tf.get_collection("summary_op")[0]
315 |
316 |     saver = tf.train.Saver(tf.global_variables())
317 |     summary_writer = tf.summary.FileWriter(
318 |         FLAGS.train_dir, graph=tf.get_default_graph())
319 |
320 |     evl_metrics = eval_util.EvaluationMetrics(reader.num_classes, FLAGS.top_k)
321 |
322 |     last_global_step_val = -1
323 |     while True:
324 |       last_global_step_val = evaluation_loop(video_id_batch, prediction_batch,
325 |                                              label_batch, loss, summary_op,
326 |                                              saver, summary_writer, evl_metrics,
327 |                                              last_global_step_val)
328 |       if FLAGS.run_once:
329 |         break
330 |
331 |
332 | def main(unused_argv):
333 |   logging.set_verbosity(tf.logging.INFO)
334 |   print("tensorflow version: %s" % tf.__version__)
335 |   evaluate()
336 |
337 |
338 | if __name__ == "__main__":
339 |   app.run()
340 |
341 |
--------------------------------------------------------------------------------
/eval_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Provides functions to help with evaluating models.""" 16 | import datetime 17 | import numpy 18 | 19 | #from tensorflow.python.platform import gfile 20 | 21 | import mean_average_precision_calculator as map_calculator 22 | import average_precision_calculator as ap_calculator 23 | 24 | def flatten(l): 25 | """ Merges a list of lists into a single list. """ 26 | return [item for sublist in l for item in sublist] 27 | 28 | def calculate_hit_at_one(predictions, actuals): 29 | """Performs a local (numpy) calculation of the hit at one. 30 | 31 | Args: 32 | predictions: Matrix containing the outputs of the model. 33 | Dimensions are 'batch' x 'num_classes'. 34 | actuals: Matrix containing the ground truth labels. 35 | Dimensions are 'batch' x 'num_classes'. 36 | 37 | Returns: 38 | float: The average hit at one across the entire batch. 39 | """ 40 | top_prediction = numpy.argmax(predictions, 1) 41 | hits = actuals[numpy.arange(actuals.shape[0]), top_prediction] 42 | return numpy.average(hits) 43 | 44 | 45 | def calculate_precision_at_equal_recall_rate(predictions, actuals): 46 | """Performs a local (numpy) calculation of the PERR. 47 | 48 | Args: 49 | predictions: Matrix containing the outputs of the model. 50 | Dimensions are 'batch' x 'num_classes'. 51 | actuals: Matrix containing the ground truth labels. 52 | Dimensions are 'batch' x 'num_classes'. 53 | 54 | Returns: 55 | float: The average precision at equal recall rate across the entire batch. 56 | """ 57 | aggregated_precision = 0.0 58 | num_videos = actuals.shape[0] 59 | for row in numpy.arange(num_videos): 60 | num_labels = int(numpy.sum(actuals[row])) 61 | top_indices = numpy.argpartition(predictions[row], 62 | -num_labels)[-num_labels:] 63 | item_precision = 0.0 64 | for label_index in top_indices: 65 | if predictions[row][label_index] > 0: 66 | item_precision += actuals[row][label_index] 67 | item_precision /= top_indices.size 68 | aggregated_precision += item_precision 69 | aggregated_precision /= num_videos 70 | return aggregated_precision 71 | 72 | def calculate_gap(predictions, actuals, top_k=20): 73 | """Performs a local (numpy) calculation of the global average precision. 74 | 75 | Only the top_k predictions are taken for each of the videos. 76 | 77 | Args: 78 | predictions: Matrix containing the outputs of the model. 79 | Dimensions are 'batch' x 'num_classes'. 80 | actuals: Matrix containing the ground truth labels. 81 | Dimensions are 'batch' x 'num_classes'. 82 | top_k: How many predictions to use per video. 83 | 84 | Returns: 85 | float: The global average precision. 86 | """ 87 | gap_calculator = ap_calculator.AveragePrecisionCalculator() 88 | sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k) 89 | gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives)) 90 | return gap_calculator.peek_ap_at_n() 91 | 92 | 93 | def top_k_by_class(predictions, labels, k=20): 94 | """Extracts the top k predictions for each video, sorted by class. 
95 |
96 |   Args:
97 |     predictions: A numpy matrix containing the outputs of the model.
98 |       Dimensions are 'batch' x 'num_classes'.
99 |     labels: A numpy matrix containing the ground truth labels ('batch' x 'num_classes').
100 |     k: the top k non-zero entries to preserve in each prediction.
101 |   Returns:
102 |     A tuple (predictions, labels, true_positives). 'predictions' and 'labels'
103 |     are lists of lists of floats. 'true_positives' is a list of scalars. The
104 |     length of the lists is equal to the number of classes. The entries in the
105 |     predictions variable are probability predictions, and
106 |     the corresponding entries in the labels variable are the ground truth for
107 |     those predictions. The entries in 'true_positives' are the number of true
108 |     positives for each class in the ground truth.
109 |
110 |   Raises:
111 |     ValueError: An error occurred when the k is not a positive integer.
112 |   """
113 |   if k <= 0:
114 |     raise ValueError("k must be a positive integer.")
115 |   k = min(k, predictions.shape[1])
116 |   num_classes = predictions.shape[1]
117 |   prediction_triplets = []
118 |   for video_index in range(predictions.shape[0]):
119 |     prediction_triplets.extend(top_k_triplets(predictions[video_index], labels[video_index], k))
120 |   out_predictions = [[] for v in range(num_classes)]
121 |   out_labels = [[] for v in range(num_classes)]
122 |   for triplet in prediction_triplets:
123 |     out_predictions[triplet[0]].append(triplet[1])
124 |     out_labels[triplet[0]].append(triplet[2])
125 |   out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)]
126 |
127 |   return out_predictions, out_labels, out_true_positives
128 |
129 | def top_k_triplets(predictions, labels, k=20):
130 |   """Get the top_k for a 1-d numpy array. Returns a sparse list of
131 |   (class_index, prediction, label) triplets."""
132 |   m = len(predictions)
133 |   k = min(k, m)
134 |   indices = numpy.argpartition(predictions, -k)[-k:]
135 |   return [(index, predictions[index], labels[index]) for index in indices]
136 |
137 | class EvaluationMetrics(object):
138 |   """A class to store the evaluation metrics."""
139 |
140 |   def __init__(self, num_class, top_k):
141 |     """Construct an EvaluationMetrics object to store the evaluation metrics.
142 |
143 |     Args:
144 |       num_class: A positive integer specifying the number of classes.
145 |       top_k: A positive integer specifying how many predictions are considered per video.
146 |
147 |     Raises:
148 |       ValueError: An error occurred when MeanAveragePrecisionCalculator cannot
149 |         be constructed.
150 |     """
151 |     self.sum_hit_at_one = 0.0
152 |     self.sum_perr = 0.0
153 |     self.sum_loss = 0.0
154 |     self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(num_class)
155 |     self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()
156 |     self.top_k = top_k
157 |     self.num_examples = 0
158 |
159 |   def accumulate(self, predictions, labels, loss):
160 |     """Accumulate the metrics calculated locally for this mini-batch.
161 |
162 |     Args:
163 |       predictions: A numpy matrix containing the outputs of the model.
164 |         Dimensions are 'batch' x 'num_classes'.
165 |       labels: A numpy matrix containing the ground truth labels.
166 |         Dimensions are 'batch' x 'num_classes'.
167 |       loss: A numpy array containing the loss for each sample.
168 |
169 |     Returns:
170 |       dictionary: A dictionary storing the metrics for the mini-batch.
171 |
172 |     Raises:
173 |       ValueError: An error occurred when the shape of predictions and actuals
174 |         does not match.
    """
    batch_size = labels.shape[0]
    mean_hit_at_one = calculate_hit_at_one(predictions, labels)
    mean_perr = calculate_precision_at_equal_recall_rate(predictions, labels)
    mean_loss = numpy.mean(loss)

    # Take the top self.top_k predictions per video.
    sparse_predictions, sparse_labels, num_positives = top_k_by_class(
        predictions, labels, self.top_k)
    self.map_calculator.accumulate(sparse_predictions, sparse_labels,
                                   num_positives)
    self.global_ap_calculator.accumulate(flatten(sparse_predictions),
                                         flatten(sparse_labels),
                                         sum(num_positives))

    self.num_examples += batch_size
    self.sum_hit_at_one += mean_hit_at_one * batch_size
    self.sum_perr += mean_perr * batch_size
    self.sum_loss += mean_loss * batch_size

    return {"hit_at_one": mean_hit_at_one, "perr": mean_perr,
            "loss": mean_loss}

  def get(self):
    """Calculate the evaluation metrics for the whole epoch.

    Raises:
      ValueError: If no examples were accumulated.

    Returns:
      dictionary: a dictionary storing the evaluation metrics for the epoch.
        The dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss,
        aps (default nan), and gap.
    """
    if self.num_examples <= 0:
      raise ValueError("total_sample must be positive.")
    avg_hit_at_one = self.sum_hit_at_one / self.num_examples
    avg_perr = self.sum_perr / self.num_examples
    avg_loss = self.sum_loss / self.num_examples

    aps = self.map_calculator.peek_map_at_n()
    gap = self.global_ap_calculator.peek_ap_at_n()

    return {"avg_hit_at_one": avg_hit_at_one, "avg_perr": avg_perr,
            "avg_loss": avg_loss, "aps": aps, "gap": gap}

  def clear(self):
    """Clear the evaluation metrics and reset the EvaluationMetrics object."""
    self.sum_hit_at_one = 0.0
    self.sum_perr = 0.0
    self.sum_loss = 0.0
    self.map_calculator.clear()
    self.global_ap_calculator.clear()
    self.num_examples = 0
--------------------------------------------------------------------------------
/export_model.py:
--------------------------------------------------------------------------------
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
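# Usage sketch (illustrative, not part of the original file): given a model,
# a reader, and a checkpoint produced by train.py, the exporter below writes
# a SavedModel whose single input is a batch of serialized SequenceExamples:
#
#   exporter = ModelExporter(frame_features=True, model=model, reader=reader)
#   exporter.export_model(model_dir="/tmp/yt8m_export",      # hypothetical path
#                         global_step_val=global_step,
#                         last_checkpoint=last_checkpoint)
#
# The exported serving signature maps "example_bytes" to the top class
# indexes and scores per video (see build_inputs_and_outputs below).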
14 | """Utilities to export a model for batch prediction.""" 15 | 16 | import tensorflow as tf 17 | import tensorflow.contrib.slim as slim 18 | 19 | from tensorflow.python.saved_model import builder as saved_model_builder 20 | from tensorflow.python.saved_model import signature_constants 21 | from tensorflow.python.saved_model import signature_def_utils 22 | from tensorflow.python.saved_model import tag_constants 23 | from tensorflow.python.saved_model import utils as saved_model_utils 24 | 25 | _TOP_PREDICTIONS_IN_OUTPUT = 20 26 | 27 | class ModelExporter(object): 28 | 29 | def __init__(self, frame_features, model, reader): 30 | self.frame_features = frame_features 31 | self.model = model 32 | self.reader = reader 33 | 34 | with tf.Graph().as_default() as graph: 35 | self.inputs, self.outputs = self.build_inputs_and_outputs() 36 | self.graph = graph 37 | self.saver = tf.train.Saver(tf.trainable_variables(), sharded=True) 38 | 39 | def export_model(self, model_dir, global_step_val, last_checkpoint): 40 | """Exports the model so that it can used for batch predictions.""" 41 | 42 | with self.graph.as_default(): 43 | with tf.Session() as session: 44 | session.run(tf.global_variables_initializer()) 45 | self.saver.restore(session, last_checkpoint) 46 | 47 | signature = signature_def_utils.build_signature_def( 48 | inputs=self.inputs, 49 | outputs=self.outputs, 50 | method_name=signature_constants.PREDICT_METHOD_NAME) 51 | 52 | signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 53 | signature} 54 | 55 | model_builder = saved_model_builder.SavedModelBuilder(model_dir) 56 | model_builder.add_meta_graph_and_variables(session, 57 | tags=[tag_constants.SERVING], 58 | signature_def_map=signature_map, 59 | clear_devices=True) 60 | model_builder.save() 61 | 62 | def build_inputs_and_outputs(self): 63 | 64 | if self.frame_features: 65 | 66 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 67 | 68 | fn = lambda x: self.build_prediction_graph(x) 69 | video_id_output, top_indices_output, top_predictions_output = ( 70 | tf.map_fn(fn, serialized_examples, 71 | dtype=(tf.string, tf.int32, tf.float32))) 72 | 73 | else: 74 | 75 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 76 | 77 | video_id_output, top_indices_output, top_predictions_output = ( 78 | self.build_prediction_graph(serialized_examples)) 79 | 80 | inputs = {"example_bytes": 81 | saved_model_utils.build_tensor_info(serialized_examples)} 82 | 83 | outputs = { 84 | "video_id": saved_model_utils.build_tensor_info(video_id_output), 85 | "class_indexes": saved_model_utils.build_tensor_info(top_indices_output), 86 | "predictions": saved_model_utils.build_tensor_info(top_predictions_output)} 87 | 88 | return inputs, outputs 89 | 90 | def build_prediction_graph(self, serialized_examples): 91 | 92 | video_id, model_input_raw, labels_batch, num_frames = ( 93 | self.reader.prepare_serialized_examples(serialized_examples)) 94 | 95 | feature_dim = len(model_input_raw.get_shape()) - 1 96 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 97 | 98 | with tf.name_scope("model"): 99 | result = self.model.create_model( 100 | model_input, 101 | num_frames=num_frames, 102 | vocab_size=self.reader.num_classes, 103 | labels=labels_batch, 104 | is_training=False) 105 | 106 | for variable in slim.get_model_variables(): 107 | tf.summary.histogram(variable.op.name, variable) 108 | 109 | predictions = result["predictions"] 110 | 111 | top_predictions, top_indices = tf.nn.top_k(predictions, 112 | 
                                               _TOP_PREDICTIONS_IN_OUTPUT)
    return video_id, top_indices, top_predictions
--------------------------------------------------------------------------------
/file_averaging.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import defaultdict, Counter

SUBMIT_PATH = ''
SIGFIGS = 6

def read_models(model_weights, blend=None):
    """Accumulates weighted (label, confidence) votes from each model's CSV."""
    if not blend:
        blend = defaultdict(Counter)
    for m, w in model_weights.items():
        print(m, w)
        with open(os.path.join(SUBMIT_PATH, m + '.csv'), 'r') as f:
            f.readline()  # skip the 'VideoID,LabelConfidencePairs' header
            for l in f:
                id, r = l.split(',')
                id, r = int(id), r.split(' ')
                # r is a flat [label, confidence, label, confidence, ...]
                # list; walk over every (label, confidence) pair.
                for i in range(0, len(r), 2):
                    k = int(r[i])
                    # Store confidences as scaled integers so the Counter
                    # sums stay exact.
                    v = int(10**(SIGFIGS - 1) * float(r[i + 1]))
                    blend[id][k] += w * v
    return blend


def write_models(blend, file_name, total_weight):
    """Writes the blended top-20 predictions back out in submission format."""
    with open(os.path.join(SUBMIT_PATH, file_name + '.csv'), 'w') as f:
        f.write('VideoID,LabelConfidencePairs\n')
        for id, v in blend.items():
            l = ' '.join(['{} {:.{}f}'.format(
                t[0],
                float(t[1]) / 10 ** (SIGFIGS - 1) / total_weight,
                SIGFIGS) for t in v.most_common(20)])
            f.write(','.join([str(id), l + '\n']))
    return None


# Equal-weight blend of the seven single-model submissions.
model_pred = {
    'test-gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-GRU-0002-1200-2': 1,
    'test-gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-gateddboflf-4096-1024-80-0002-300iter': 1,
    'test-softdboflf-8000-1024-80-0002-300iter': 1,
    'test-gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-lstm-0002-val-150-random': 1,
}

avg = read_models(model_pred)
write_models(avg, 'WILLOW_submission', sum(model_pred.values()))
--------------------------------------------------------------------------------
/frame_level_models.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains a collection of models which operate on variable-length sequences.
"""
import math

import models
import video_level_models
import tensorflow as tf
import model_utils as utils

import tensorflow.contrib.slim as slim
from tensorflow import flags

FLAGS = flags.FLAGS


flags.DEFINE_bool("gating_remove_diag", False,
                  "Remove diag for self gating")
flags.DEFINE_bool("lightvlad", False,
                  "Light or full NetVLAD")
flags.DEFINE_bool("vlagd", False,
                  "vlagd or vlad")


flags.DEFINE_integer("iterations", 30,
                     "Number of frames per batch for DBoF.")
flags.DEFINE_bool("dbof_add_batch_norm", True,
                  "Adds batch normalization to the DBoF model.")
flags.DEFINE_bool(
    "sample_random_frames", True,
    "If true, samples random frames (for frame level models). If false, a "
    "random sequence of frames is sampled instead.")
flags.DEFINE_integer("dbof_cluster_size", 16384,
                     "Number of units in the DBoF cluster layer.")
flags.DEFINE_integer("dbof_hidden_size", 2048,
                     "Number of units in the DBoF hidden layer.")
flags.DEFINE_bool("dbof_relu", True, "Add ReLU to the DBoF hidden layer.")
flags.DEFINE_integer("dbof_var_features", 0,
                     "Variance features on top of the DBoF cluster layer.")

flags.DEFINE_string("dbof_activation", "relu", "DBoF cluster activation.")

flags.DEFINE_bool("softdbof_maxpool", False, "Add max pooling to Soft-DBoF.")

flags.DEFINE_integer("netvlad_cluster_size", 64,
                     "Number of units in the NetVLAD cluster layer.")
flags.DEFINE_bool("netvlad_relu", True, "Add ReLU to the NetVLAD hidden layer.")
flags.DEFINE_integer("netvlad_dimred", -1,
                     "NetVLAD output dimension reduction")
flags.DEFINE_integer("gatednetvlad_dimred", 1024,
                     "GatedNetVLAD output dimension reduction")

flags.DEFINE_bool("gating", False,
                  "Gating for NetVLAD")
flags.DEFINE_integer("hidden_size", 1024,
                     "Size of the hidden layer for BasicStatModel.")


flags.DEFINE_integer("netvlad_hidden_size", 1024,
                     "Number of units in the NetVLAD hidden layer.")

flags.DEFINE_integer("netvlad_hidden_size_video", 1024,
                     "Number of units in the NetVLAD video hidden layer.")

flags.DEFINE_integer("netvlad_hidden_size_audio", 64,
                     "Number of units in the NetVLAD audio hidden layer.")


flags.DEFINE_bool("netvlad_add_batch_norm", True,
                  "Adds batch normalization to the NetVLAD model.")

flags.DEFINE_integer("fv_cluster_size", 64,
                     "Number of units in the NetFV cluster layer.")

flags.DEFINE_integer("fv_hidden_size", 2048,
                     "Number of units in the NetFV hidden layer.")
flags.DEFINE_bool("fv_relu", True,
                  "ReLU after the NetFV hidden layer.")


flags.DEFINE_bool("fv_couple_weights", True,
                  "Couple the cluster and covariance weights or not")

flags.DEFINE_float("fv_coupling_factor", 0.01,
                   "Coupling factor")


flags.DEFINE_string("dbof_pooling_method", "max",
                    "The pooling method used in the DBoF cluster layer. "
                    "Choices are 'average' and 'max'.")
" 108 | "Choices are 'average' and 'max'.") 109 | flags.DEFINE_string("video_level_classifier_model", "MoeModel", 110 | "Some Frame-Level models can be decomposed into a " 111 | "generalized pooling operation followed by a " 112 | "classifier layer") 113 | flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.") 114 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.") 115 | flags.DEFINE_integer("lstm_cells_video", 1024, "Number of LSTM cells (video).") 116 | flags.DEFINE_integer("lstm_cells_audio", 128, "Number of LSTM cells (audio).") 117 | 118 | 119 | 120 | flags.DEFINE_integer("gru_cells", 1024, "Number of GRU cells.") 121 | flags.DEFINE_integer("gru_cells_video", 1024, "Number of GRU cells (video).") 122 | flags.DEFINE_integer("gru_cells_audio", 128, "Number of GRU cells (audio).") 123 | flags.DEFINE_integer("gru_layers", 2, "Number of GRU layers.") 124 | flags.DEFINE_bool("lstm_random_sequence", False, 125 | "Random sequence input for lstm.") 126 | flags.DEFINE_bool("gru_random_sequence", False, 127 | "Random sequence input for gru.") 128 | flags.DEFINE_bool("gru_backward", False, "BW reading for GRU") 129 | flags.DEFINE_bool("lstm_backward", False, "BW reading for LSTM") 130 | 131 | 132 | flags.DEFINE_bool("fc_dimred", True, "Adding FC dimred after pooling") 133 | 134 | class LightVLAD(): 135 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): 136 | self.feature_size = feature_size 137 | self.max_frames = max_frames 138 | self.is_training = is_training 139 | self.add_batch_norm = add_batch_norm 140 | self.cluster_size = cluster_size 141 | 142 | def forward(self,reshaped_input): 143 | 144 | 145 | cluster_weights = tf.get_variable("cluster_weights", 146 | [self.feature_size, self.cluster_size], 147 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 148 | 149 | activation = tf.matmul(reshaped_input, cluster_weights) 150 | 151 | if self.add_batch_norm: 152 | activation = slim.batch_norm( 153 | activation, 154 | center=True, 155 | scale=True, 156 | is_training=self.is_training, 157 | scope="cluster_bn") 158 | else: 159 | cluster_biases = tf.get_variable("cluster_biases", 160 | [cluster_size], 161 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 162 | tf.summary.histogram("cluster_biases", cluster_biases) 163 | activation += cluster_biases 164 | 165 | activation = tf.nn.softmax(activation) 166 | 167 | activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) 168 | 169 | activation = tf.transpose(activation,perm=[0,2,1]) 170 | 171 | reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) 172 | vlad = tf.matmul(activation,reshaped_input) 173 | 174 | vlad = tf.transpose(vlad,perm=[0,2,1]) 175 | vlad = tf.nn.l2_normalize(vlad,1) 176 | 177 | vlad = tf.reshape(vlad,[-1,self.cluster_size*self.feature_size]) 178 | vlad = tf.nn.l2_normalize(vlad,1) 179 | 180 | return vlad 181 | 182 | 183 | class NetVLAD(): 184 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): 185 | self.feature_size = feature_size 186 | self.max_frames = max_frames 187 | self.is_training = is_training 188 | self.add_batch_norm = add_batch_norm 189 | self.cluster_size = cluster_size 190 | 191 | def forward(self,reshaped_input): 192 | 193 | 194 | cluster_weights = tf.get_variable("cluster_weights", 195 | [self.feature_size, self.cluster_size], 196 | initializer = tf.random_normal_initializer(stddev=1 / 
class NetVLAD():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    # a_sum is the total soft-assignment mass per cluster; multiplying it by
    # the cluster centers gives the term subtracted to form residuals.
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)

    vlad = tf.nn.l2_normalize(vlad, 1)

    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)

    return vlad


class NetVLAGD():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    gate_weights = tf.get_variable(
        "gate_weights",
        [1, self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    gate_weights = tf.sigmoid(gate_weights)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])

    vlagd = tf.matmul(activation, reshaped_input)
    vlagd = tf.multiply(vlagd, gate_weights)

    vlagd = tf.transpose(vlagd, perm=[0, 2, 1])

    vlagd = tf.nn.l2_normalize(vlagd, 1)

    vlagd = tf.reshape(vlagd, [-1, self.cluster_size * self.feature_size])
    vlagd = tf.nn.l2_normalize(vlagd, 1)

    return vlagd

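# All pooling classes in this file follow the same recipe: softmax
# soft-assignment of every frame to `cluster_size` anchors, then an
# aggregated statistic of the assignments. A NumPy sketch of the NetVLAD
# case for a single video (ours, for illustration only; shapes follow the
# 1024-d RGB features):
#
#   import numpy as np
#   X = np.random.randn(300, 1024)                # frames x feature_size
#   W = np.random.randn(1024, 64)                 # cluster_weights
#   C = np.random.randn(64, 1024)                 # cluster centers
#   A = np.exp(X @ W)
#   A /= A.sum(axis=1, keepdims=True)             # softmax soft assignment
#   vlad = A.T @ X - A.sum(axis=0)[:, None] * C   # residual sums per cluster
#   vlad /= np.linalg.norm(vlad, axis=1, keepdims=True)  # intra-normalization
#   v = vlad.reshape(-1)
#   v /= np.linalg.norm(v)                        # final descriptor
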
class GatedDBoF():
  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    activation_sum = tf.reduce_sum(activation, 1)

    activation_max = tf.reduce_max(activation, 1)
    activation_max = tf.nn.l2_normalize(activation_max, 1)

    # The max-pooled histogram drives the gates applied to the sum-pooled one.
    dim_red = tf.get_variable(
        "dim_red",
        [cluster_size, feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    cluster_weights_2 = tf.get_variable(
        "cluster_weights_2",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights_2", cluster_weights_2)

    activation = tf.matmul(activation_max, dim_red)
    activation = tf.matmul(activation, cluster_weights_2)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn_2")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases_2",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases_2", cluster_biases)
      activation += cluster_biases

    activation = tf.sigmoid(activation)

    activation = tf.multiply(activation, activation_sum)
    activation = tf.nn.l2_normalize(activation, 1)

    return activation


class SoftDBoF():
  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    # Sum-pool the soft assignments over frames (a soft bag-of-words count).
    activation_sum = tf.reduce_sum(activation, 1)
    activation_sum = tf.nn.l2_normalize(activation_sum, 1)

    if max_pool:
      activation_max = tf.reduce_max(activation, 1)
      activation_max = tf.nn.l2_normalize(activation_max, 1)
      activation = tf.concat([activation_sum, activation_max], 1)
    else:
      activation = activation_sum

    return activation


class DBoF():
  def __init__(self, feature_size, max_frames, cluster_size, activation,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.activation = activation

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    # Compare the configured activation type (a string held in
    # self.activation), not the tensor named 'activation' above.
    if self.activation == 'glu':
      # Gated linear unit: the first half of the units carries the signal,
      # the second half gates it, halving the output width.
      space = activation[:, :cluster_size // 2]
      gates = tf.sigmoid(activation[:, cluster_size // 2:])
      activation = tf.multiply(space, gates)
      out_size = cluster_size // 2
    else:
      if self.activation == 'relu':
        activation = tf.nn.relu6(activation)
      out_size = cluster_size

    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, out_size])

    avg_activation = utils.FramePooling(activation, 'average')
    avg_activation = tf.nn.l2_normalize(avg_activation, 1)

    max_activation = utils.FramePooling(activation, 'max')
    max_activation = tf.nn.l2_normalize(max_activation, 1)

    return tf.concat([avg_activation, max_activation], 1)

class NetFV():
  def __init__(self, feature_size, max_frames, cluster_size,
add_batch_norm, is_training): 507 | self.feature_size = feature_size 508 | self.max_frames = max_frames 509 | self.is_training = is_training 510 | self.add_batch_norm = add_batch_norm 511 | self.cluster_size = cluster_size 512 | 513 | def forward(self,reshaped_input): 514 | cluster_weights = tf.get_variable("cluster_weights", 515 | [self.feature_size, self.cluster_size], 516 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 517 | 518 | covar_weights = tf.get_variable("covar_weights", 519 | [self.feature_size, self.cluster_size], 520 | initializer = tf.random_normal_initializer(mean=1.0, stddev=1 /math.sqrt(self.feature_size))) 521 | 522 | covar_weights = tf.square(covar_weights) 523 | eps = tf.constant([1e-6]) 524 | covar_weights = tf.add(covar_weights,eps) 525 | 526 | tf.summary.histogram("cluster_weights", cluster_weights) 527 | activation = tf.matmul(reshaped_input, cluster_weights) 528 | if self.add_batch_norm: 529 | activation = slim.batch_norm( 530 | activation, 531 | center=True, 532 | scale=True, 533 | is_training=self.is_training, 534 | scope="cluster_bn") 535 | else: 536 | cluster_biases = tf.get_variable("cluster_biases", 537 | [self.cluster_size], 538 | initializer = tf.random_normal(stddev=1 / math.sqrt(self.feature_size))) 539 | tf.summary.histogram("cluster_biases", cluster_biases) 540 | activation += cluster_biases 541 | 542 | activation = tf.nn.softmax(activation) 543 | tf.summary.histogram("cluster_output", activation) 544 | 545 | activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) 546 | 547 | a_sum = tf.reduce_sum(activation,-2,keep_dims=True) 548 | 549 | if not FLAGS.fv_couple_weights: 550 | cluster_weights2 = tf.get_variable("cluster_weights2", 551 | [1,self.feature_size, self.cluster_size], 552 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 553 | else: 554 | cluster_weights2 = tf.scalar_mul(FLAGS.fv_coupling_factor,cluster_weights) 555 | 556 | a = tf.multiply(a_sum,cluster_weights2) 557 | 558 | activation = tf.transpose(activation,perm=[0,2,1]) 559 | 560 | reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) 561 | fv1 = tf.matmul(activation,reshaped_input) 562 | 563 | fv1 = tf.transpose(fv1,perm=[0,2,1]) 564 | 565 | # computing second order FV 566 | a2 = tf.multiply(a_sum,tf.square(cluster_weights2)) 567 | 568 | b2 = tf.multiply(fv1,cluster_weights2) 569 | fv2 = tf.matmul(activation,tf.square(reshaped_input)) 570 | 571 | fv2 = tf.transpose(fv2,perm=[0,2,1]) 572 | fv2 = tf.add_n([a2,fv2,tf.scalar_mul(-2,b2)]) 573 | 574 | fv2 = tf.divide(fv2,tf.square(covar_weights)) 575 | fv2 = tf.subtract(fv2,a_sum) 576 | 577 | fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size]) 578 | 579 | fv2 = tf.nn.l2_normalize(fv2,1) 580 | fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size]) 581 | fv2 = tf.nn.l2_normalize(fv2,1) 582 | 583 | fv1 = tf.subtract(fv1,a) 584 | fv1 = tf.divide(fv1,covar_weights) 585 | 586 | fv1 = tf.nn.l2_normalize(fv1,1) 587 | fv1 = tf.reshape(fv1,[-1,self.cluster_size*self.feature_size]) 588 | fv1 = tf.nn.l2_normalize(fv1,1) 589 | 590 | return tf.concat([fv1,fv2],1) 591 | 592 | class NetVLADModelLF(models.BaseModel): 593 | """Creates a NetVLAD based model. 594 | 595 | Args: 596 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 597 | input features. 598 | vocab_size: The number of classes in the dataset. 
599 | num_frames: A vector of length 'batch' which indicates the number of 600 | frames for each video (before padding). 601 | 602 | Returns: 603 | A dictionary with a tensor containing the probability predictions of the 604 | model in the 'predictions' key. The dimensions of the tensor are 605 | 'batch_size' x 'num_classes'. 606 | """ 607 | 608 | 609 | def create_model(self, 610 | model_input, 611 | vocab_size, 612 | num_frames, 613 | iterations=None, 614 | add_batch_norm=None, 615 | sample_random_frames=None, 616 | cluster_size=None, 617 | hidden_size=None, 618 | is_training=True, 619 | **unused_params): 620 | iterations = iterations or FLAGS.iterations 621 | add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm 622 | random_frames = sample_random_frames or FLAGS.sample_random_frames 623 | cluster_size = cluster_size or FLAGS.netvlad_cluster_size 624 | hidden1_size = hidden_size or FLAGS.netvlad_hidden_size 625 | relu = FLAGS.netvlad_relu 626 | dimred = FLAGS.netvlad_dimred 627 | gating = FLAGS.gating 628 | remove_diag = FLAGS.gating_remove_diag 629 | lightvlad = FLAGS.lightvlad 630 | vlagd = FLAGS.vlagd 631 | 632 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 633 | if random_frames: 634 | model_input = utils.SampleRandomFrames(model_input, num_frames, 635 | iterations) 636 | else: 637 | model_input = utils.SampleRandomSequence(model_input, num_frames, 638 | iterations) 639 | 640 | 641 | max_frames = model_input.get_shape().as_list()[1] 642 | feature_size = model_input.get_shape().as_list()[2] 643 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 644 | 645 | if lightvlad: 646 | video_NetVLAD = LightVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training) 647 | audio_NetVLAD = LightVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 648 | elif vlagd: 649 | video_NetVLAD = NetVLAGD(1024,max_frames,cluster_size, add_batch_norm, is_training) 650 | audio_NetVLAD = NetVLAGD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 651 | else: 652 | video_NetVLAD = NetVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training) 653 | audio_NetVLAD = NetVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 654 | 655 | 656 | if add_batch_norm:# and not lightvlad: 657 | reshaped_input = slim.batch_norm( 658 | reshaped_input, 659 | center=True, 660 | scale=True, 661 | is_training=is_training, 662 | scope="input_bn") 663 | 664 | with tf.variable_scope("video_VLAD"): 665 | vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) 666 | 667 | with tf.variable_scope("audio_VLAD"): 668 | vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:]) 669 | 670 | vlad = tf.concat([vlad_video, vlad_audio],1) 671 | 672 | vlad_dim = vlad.get_shape().as_list()[1] 673 | hidden1_weights = tf.get_variable("hidden1_weights", 674 | [vlad_dim, hidden1_size], 675 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 676 | 677 | activation = tf.matmul(vlad, hidden1_weights) 678 | 679 | if add_batch_norm and relu: 680 | activation = slim.batch_norm( 681 | activation, 682 | center=True, 683 | scale=True, 684 | is_training=is_training, 685 | scope="hidden1_bn") 686 | 687 | else: 688 | hidden1_biases = tf.get_variable("hidden1_biases", 689 | [hidden1_size], 690 | initializer = tf.random_normal_initializer(stddev=0.01)) 691 | tf.summary.histogram("hidden1_biases", hidden1_biases) 692 | activation += hidden1_biases 693 | 694 | if relu: 695 | activation = tf.nn.relu6(activation) 696 | 697 | 698 | if gating: 699 | 
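      # Context Gating (Miech et al., 2017): re-weight the pooled descriptor
      # with learned sigmoid gates, roughly
      #   activation <- sigmoid(activation . W [+ b]) * activation,
      # so correlated clusters can suppress or boost one another.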
      gating_weights = tf.get_variable(
          "gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))

      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients so a unit cannot gate itself.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)

      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)

class DbofModelLF(models.BaseModel):
  """Creates a Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
759 | """ 760 | 761 | def create_model(self, 762 | model_input, 763 | vocab_size, 764 | num_frames, 765 | iterations=None, 766 | add_batch_norm=None, 767 | sample_random_frames=None, 768 | cluster_size=None, 769 | hidden_size=None, 770 | is_training=True, 771 | **unused_params): 772 | iterations = iterations or FLAGS.iterations 773 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 774 | random_frames = sample_random_frames or FLAGS.sample_random_frames 775 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 776 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 777 | relu = FLAGS.dbof_relu 778 | cluster_activation = FLAGS.dbof_activation 779 | 780 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 781 | if random_frames: 782 | model_input = utils.SampleRandomFrames(model_input, num_frames, 783 | iterations) 784 | else: 785 | model_input = utils.SampleRandomSequence(model_input, num_frames, 786 | iterations) 787 | max_frames = model_input.get_shape().as_list()[1] 788 | feature_size = model_input.get_shape().as_list()[2] 789 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 790 | tf.summary.histogram("input_hist", reshaped_input) 791 | 792 | if cluster_activation == 'glu': 793 | cluster_size = 2*cluster_size 794 | 795 | video_Dbof = DBoF(1024,max_frames,cluster_size, cluster_activation, add_batch_norm, is_training) 796 | audio_Dbof = DBoF(128,max_frames,cluster_size/8, cluster_activation, add_batch_norm, is_training) 797 | 798 | 799 | if add_batch_norm: 800 | reshaped_input = slim.batch_norm( 801 | reshaped_input, 802 | center=True, 803 | scale=True, 804 | is_training=is_training, 805 | scope="input_bn") 806 | 807 | with tf.variable_scope("video_DBOF"): 808 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 809 | 810 | with tf.variable_scope("audio_DBOF"): 811 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 812 | 813 | dbof = tf.concat([dbof_video, dbof_audio],1) 814 | 815 | dbof_dim = dbof.get_shape().as_list()[1] 816 | 817 | hidden1_weights = tf.get_variable("hidden1_weights", 818 | [dbof_dim, hidden1_size], 819 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 820 | tf.summary.histogram("hidden1_weights", hidden1_weights) 821 | activation = tf.matmul(dbof, hidden1_weights) 822 | 823 | if add_batch_norm and relu: 824 | activation = slim.batch_norm( 825 | activation, 826 | center=True, 827 | scale=True, 828 | is_training=is_training, 829 | scope="hidden1_bn") 830 | else: 831 | hidden1_biases = tf.get_variable("hidden1_biases", 832 | [hidden1_size], 833 | initializer = tf.random_normal_initializer(stddev=0.01)) 834 | tf.summary.histogram("hidden1_biases", hidden1_biases) 835 | activation += hidden1_biases 836 | 837 | if relu: 838 | activation = tf.nn.relu6(activation) 839 | tf.summary.histogram("hidden1_output", activation) 840 | 841 | aggregated_model = getattr(video_level_models, 842 | FLAGS.video_level_classifier_model) 843 | 844 | return aggregated_model().create_model( 845 | model_input=activation, 846 | vocab_size=vocab_size, 847 | **unused_params) 848 | 849 | class GatedDbofModelLF(models.BaseModel): 850 | """Creates a Gated Deep Bag of Frames model. 851 | 852 | The model projects the features for each frame into a higher dimensional 853 | 'clustering' space, pools across frames in that space, and then 854 | uses a configurable video-level model to classify the now aggregated features. 
855 | 856 | The model will randomly sample either frames or sequences of frames during 857 | training to speed up convergence. 858 | 859 | Args: 860 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 861 | input features. 862 | vocab_size: The number of classes in the dataset. 863 | num_frames: A vector of length 'batch' which indicates the number of 864 | frames for each video (before padding). 865 | 866 | Returns: 867 | A dictionary with a tensor containing the probability predictions of the 868 | model in the 'predictions' key. The dimensions of the tensor are 869 | 'batch_size' x 'num_classes'. 870 | """ 871 | 872 | def create_model(self, 873 | model_input, 874 | vocab_size, 875 | num_frames, 876 | iterations=None, 877 | add_batch_norm=None, 878 | sample_random_frames=None, 879 | cluster_size=None, 880 | hidden_size=None, 881 | is_training=True, 882 | **unused_params): 883 | iterations = iterations or FLAGS.iterations 884 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 885 | random_frames = sample_random_frames or FLAGS.sample_random_frames 886 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 887 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 888 | fc_dimred = FLAGS.fc_dimred 889 | relu = FLAGS.dbof_relu 890 | max_pool = FLAGS.softdbof_maxpool 891 | 892 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 893 | if random_frames: 894 | model_input = utils.SampleRandomFrames(model_input, num_frames, 895 | iterations) 896 | else: 897 | model_input = utils.SampleRandomSequence(model_input, num_frames, 898 | iterations) 899 | max_frames = model_input.get_shape().as_list()[1] 900 | feature_size = model_input.get_shape().as_list()[2] 901 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 902 | tf.summary.histogram("input_hist", reshaped_input) 903 | 904 | video_Dbof = GatedDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) 905 | audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) 906 | 907 | 908 | if add_batch_norm: 909 | reshaped_input = slim.batch_norm( 910 | reshaped_input, 911 | center=True, 912 | scale=True, 913 | is_training=is_training, 914 | scope="input_bn") 915 | 916 | with tf.variable_scope("video_DBOF"): 917 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 918 | 919 | with tf.variable_scope("audio_DBOF"): 920 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 921 | 922 | dbof = tf.concat([dbof_video, dbof_audio],1) 923 | 924 | dbof_dim = dbof.get_shape().as_list()[1] 925 | 926 | if fc_dimred: 927 | hidden1_weights = tf.get_variable("hidden1_weights", 928 | [dbof_dim, hidden1_size], 929 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 930 | tf.summary.histogram("hidden1_weights", hidden1_weights) 931 | activation = tf.matmul(dbof, hidden1_weights) 932 | 933 | if add_batch_norm and relu: 934 | activation = slim.batch_norm( 935 | activation, 936 | center=True, 937 | scale=True, 938 | is_training=is_training, 939 | scope="hidden1_bn") 940 | else: 941 | hidden1_biases = tf.get_variable("hidden1_biases", 942 | [hidden1_size], 943 | initializer = tf.random_normal_initializer(stddev=0.01)) 944 | tf.summary.histogram("hidden1_biases", hidden1_biases) 945 | activation += hidden1_biases 946 | 947 | if relu: 948 | activation = tf.nn.relu6(activation) 949 | tf.summary.histogram("hidden1_output", activation) 950 | else: 951 | activation = dbof 952 | 953 | aggregated_model = getattr(video_level_models, 954 | 
FLAGS.video_level_classifier_model) 955 | 956 | 957 | return aggregated_model().create_model( 958 | model_input=activation, 959 | vocab_size=vocab_size, 960 | is_training=is_training, 961 | **unused_params) 962 | 963 | 964 | class SoftDbofModelLF(models.BaseModel): 965 | """Creates a Soft Deep Bag of Frames model. 966 | 967 | The model projects the features for each frame into a higher dimensional 968 | 'clustering' space, pools across frames in that space, and then 969 | uses a configurable video-level model to classify the now aggregated features. 970 | 971 | The model will randomly sample either frames or sequences of frames during 972 | training to speed up convergence. 973 | 974 | Args: 975 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 976 | input features. 977 | vocab_size: The number of classes in the dataset. 978 | num_frames: A vector of length 'batch' which indicates the number of 979 | frames for each video (before padding). 980 | 981 | Returns: 982 | A dictionary with a tensor containing the probability predictions of the 983 | model in the 'predictions' key. The dimensions of the tensor are 984 | 'batch_size' x 'num_classes'. 985 | """ 986 | 987 | def create_model(self, 988 | model_input, 989 | vocab_size, 990 | num_frames, 991 | iterations=None, 992 | add_batch_norm=None, 993 | sample_random_frames=None, 994 | cluster_size=None, 995 | hidden_size=None, 996 | is_training=True, 997 | **unused_params): 998 | iterations = iterations or FLAGS.iterations 999 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 1000 | random_frames = sample_random_frames or FLAGS.sample_random_frames 1001 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 1002 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 1003 | fc_dimred = FLAGS.fc_dimred 1004 | relu = FLAGS.dbof_relu 1005 | max_pool = FLAGS.softdbof_maxpool 1006 | 1007 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1008 | if random_frames: 1009 | model_input = utils.SampleRandomFrames(model_input, num_frames, 1010 | iterations) 1011 | else: 1012 | model_input = utils.SampleRandomSequence(model_input, num_frames, 1013 | iterations) 1014 | max_frames = model_input.get_shape().as_list()[1] 1015 | feature_size = model_input.get_shape().as_list()[2] 1016 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 1017 | tf.summary.histogram("input_hist", reshaped_input) 1018 | 1019 | video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) 1020 | audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) 1021 | 1022 | 1023 | if add_batch_norm: 1024 | reshaped_input = slim.batch_norm( 1025 | reshaped_input, 1026 | center=True, 1027 | scale=True, 1028 | is_training=is_training, 1029 | scope="input_bn") 1030 | 1031 | with tf.variable_scope("video_DBOF"): 1032 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 1033 | 1034 | with tf.variable_scope("audio_DBOF"): 1035 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 1036 | 1037 | dbof = tf.concat([dbof_video, dbof_audio],1) 1038 | 1039 | dbof_dim = dbof.get_shape().as_list()[1] 1040 | 1041 | if fc_dimred: 1042 | hidden1_weights = tf.get_variable("hidden1_weights", 1043 | [dbof_dim, hidden1_size], 1044 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 1045 | tf.summary.histogram("hidden1_weights", hidden1_weights) 1046 | activation = tf.matmul(dbof, hidden1_weights) 1047 | 1048 | if add_batch_norm and relu: 1049 | 
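        # With batch normalization the learned beta offset subsumes the bias,
        # so explicit hidden1_biases are only created in the else branch.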
activation = slim.batch_norm( 1050 | activation, 1051 | center=True, 1052 | scale=True, 1053 | is_training=is_training, 1054 | scope="hidden1_bn") 1055 | else: 1056 | hidden1_biases = tf.get_variable("hidden1_biases", 1057 | [hidden1_size], 1058 | initializer = tf.random_normal_initializer(stddev=0.01)) 1059 | tf.summary.histogram("hidden1_biases", hidden1_biases) 1060 | activation += hidden1_biases 1061 | 1062 | if relu: 1063 | activation = tf.nn.relu6(activation) 1064 | tf.summary.histogram("hidden1_output", activation) 1065 | else: 1066 | activation = dbof 1067 | 1068 | aggregated_model = getattr(video_level_models, 1069 | FLAGS.video_level_classifier_model) 1070 | 1071 | 1072 | return aggregated_model().create_model( 1073 | model_input=activation, 1074 | vocab_size=vocab_size, 1075 | is_training=is_training, 1076 | **unused_params) 1077 | 1078 | class LstmModel(models.BaseModel): 1079 | 1080 | def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): 1081 | """Creates a model which uses a stack of LSTMs to represent the video. 1082 | 1083 | Args: 1084 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1085 | input features. 1086 | vocab_size: The number of classes in the dataset. 1087 | num_frames: A vector of length 'batch' which indicates the number of 1088 | frames for each video (before padding). 1089 | 1090 | Returns: 1091 | A dictionary with a tensor containing the probability predictions of the 1092 | model in the 'predictions' key. The dimensions of the tensor are 1093 | 'batch_size' x 'num_classes'. 1094 | """ 1095 | lstm_size = FLAGS.lstm_cells 1096 | number_of_layers = FLAGS.lstm_layers 1097 | random_frames = FLAGS.lstm_random_sequence 1098 | iterations = FLAGS.iterations 1099 | backward = FLAGS.lstm_backward 1100 | 1101 | if random_frames: 1102 | num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1103 | model_input = utils.SampleRandomFrames(model_input, num_frames_2, 1104 | iterations) 1105 | if backward: 1106 | model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) 1107 | 1108 | stacked_lstm = tf.contrib.rnn.MultiRNNCell( 1109 | [ 1110 | tf.contrib.rnn.BasicLSTMCell( 1111 | lstm_size, forget_bias=1.0, state_is_tuple=False) 1112 | for _ in range(number_of_layers) 1113 | ], state_is_tuple=False) 1114 | 1115 | loss = 0.0 1116 | with tf.variable_scope("RNN"): 1117 | outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, 1118 | sequence_length=num_frames, 1119 | dtype=tf.float32) 1120 | 1121 | aggregated_model = getattr(video_level_models, 1122 | FLAGS.video_level_classifier_model) 1123 | 1124 | return aggregated_model().create_model( 1125 | model_input=state, 1126 | vocab_size=vocab_size, 1127 | is_training=is_training, 1128 | **unused_params) 1129 | 1130 | 1131 | class GruModel(models.BaseModel): 1132 | 1133 | def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): 1134 | """Creates a model which uses a stack of GRUs to represent the video. 1135 | 1136 | Args: 1137 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1138 | input features. 1139 | vocab_size: The number of classes in the dataset. 1140 | num_frames: A vector of length 'batch' which indicates the number of 1141 | frames for each video (before padding). 1142 | 1143 | Returns: 1144 | A dictionary with a tensor containing the probability predictions of the 1145 | model in the 'predictions' key. 
The dimensions of the tensor are 1146 | 'batch_size' x 'num_classes'. 1147 | """ 1148 | gru_size = FLAGS.gru_cells 1149 | number_of_layers = FLAGS.gru_layers 1150 | backward = FLAGS.gru_backward 1151 | random_frames = FLAGS.gru_random_sequence 1152 | iterations = FLAGS.iterations 1153 | 1154 | if random_frames: 1155 | num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1156 | model_input = utils.SampleRandomFrames(model_input, num_frames_2, 1157 | iterations) 1158 | 1159 | if backward: 1160 | model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) 1161 | 1162 | stacked_GRU = tf.contrib.rnn.MultiRNNCell( 1163 | [ 1164 | tf.contrib.rnn.GRUCell(gru_size) 1165 | for _ in range(number_of_layers) 1166 | ], state_is_tuple=False) 1167 | 1168 | loss = 0.0 1169 | with tf.variable_scope("RNN"): 1170 | outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input, 1171 | sequence_length=num_frames, 1172 | dtype=tf.float32) 1173 | 1174 | aggregated_model = getattr(video_level_models, 1175 | FLAGS.video_level_classifier_model) 1176 | return aggregated_model().create_model( 1177 | model_input=state, 1178 | vocab_size=vocab_size, 1179 | is_training=is_training, 1180 | **unused_params) 1181 | 1182 | 1183 | 1184 | class NetFVModelLF(models.BaseModel): 1185 | """Creates a NetFV based model. 1186 | It emulates a Gaussian Mixture Fisher Vector pooling operations 1187 | 1188 | Args: 1189 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1190 | input features. 1191 | vocab_size: The number of classes in the dataset. 1192 | num_frames: A vector of length 'batch' which indicates the number of 1193 | frames for each video (before padding). 1194 | 1195 | Returns: 1196 | A dictionary with a tensor containing the probability predictions of the 1197 | model in the 'predictions' key. The dimensions of the tensor are 1198 | 'batch_size' x 'num_classes'. 
1199 | """ 1200 | 1201 | 1202 | def create_model(self, 1203 | model_input, 1204 | vocab_size, 1205 | num_frames, 1206 | iterations=None, 1207 | add_batch_norm=None, 1208 | sample_random_frames=None, 1209 | cluster_size=None, 1210 | hidden_size=None, 1211 | is_training=True, 1212 | **unused_params): 1213 | iterations = iterations or FLAGS.iterations 1214 | add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm 1215 | random_frames = sample_random_frames or FLAGS.sample_random_frames 1216 | cluster_size = cluster_size or FLAGS.fv_cluster_size 1217 | hidden1_size = hidden_size or FLAGS.fv_hidden_size 1218 | relu = FLAGS.fv_relu 1219 | gating = FLAGS.gating 1220 | 1221 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1222 | if random_frames: 1223 | model_input = utils.SampleRandomFrames(model_input, num_frames, 1224 | iterations) 1225 | else: 1226 | model_input = utils.SampleRandomSequence(model_input, num_frames, 1227 | iterations) 1228 | max_frames = model_input.get_shape().as_list()[1] 1229 | feature_size = model_input.get_shape().as_list()[2] 1230 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 1231 | tf.summary.histogram("input_hist", reshaped_input) 1232 | 1233 | video_NetFV = NetFV(1024,max_frames,cluster_size, add_batch_norm, is_training) 1234 | audio_NetFV = NetFV(128,max_frames,cluster_size/2, add_batch_norm, is_training) 1235 | 1236 | 1237 | if add_batch_norm: 1238 | reshaped_input = slim.batch_norm( 1239 | reshaped_input, 1240 | center=True, 1241 | scale=True, 1242 | is_training=is_training, 1243 | scope="input_bn") 1244 | 1245 | with tf.variable_scope("video_FV"): 1246 | fv_video = video_NetFV.forward(reshaped_input[:,0:1024]) 1247 | 1248 | with tf.variable_scope("audio_FV"): 1249 | fv_audio = audio_NetFV.forward(reshaped_input[:,1024:]) 1250 | 1251 | fv = tf.concat([fv_video, fv_audio],1) 1252 | 1253 | fv_dim = fv.get_shape().as_list()[1] 1254 | hidden1_weights = tf.get_variable("hidden1_weights", 1255 | [fv_dim, hidden1_size], 1256 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 1257 | 1258 | activation = tf.matmul(fv, hidden1_weights) 1259 | 1260 | if add_batch_norm and relu: 1261 | activation = slim.batch_norm( 1262 | activation, 1263 | center=True, 1264 | scale=True, 1265 | is_training=is_training, 1266 | scope="hidden1_bn") 1267 | else: 1268 | hidden1_biases = tf.get_variable("hidden1_biases", 1269 | [hidden1_size], 1270 | initializer = tf.random_normal_initializer(stddev=0.01)) 1271 | tf.summary.histogram("hidden1_biases", hidden1_biases) 1272 | activation += hidden1_biases 1273 | 1274 | if relu: 1275 | activation = tf.nn.relu6(activation) 1276 | 1277 | if gating: 1278 | gating_weights = tf.get_variable("gating_weights_2", 1279 | [hidden1_size, hidden1_size], 1280 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size))) 1281 | 1282 | gates = tf.matmul(activation, gating_weights) 1283 | 1284 | if add_batch_norm: 1285 | gates = slim.batch_norm( 1286 | gates, 1287 | center=True, 1288 | scale=True, 1289 | is_training=is_training, 1290 | scope="gating_bn") 1291 | else: 1292 | gating_biases = tf.get_variable("gating_biases", 1293 | [cluster_size], 1294 | initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size))) 1295 | gates += gating_biases 1296 | 1297 | gates = tf.sigmoid(gates) 1298 | 1299 | activation = tf.multiply(activation,gates) 1300 | 1301 | 1302 | aggregated_model = getattr(video_level_models, 1303 | FLAGS.video_level_classifier_model) 1304 | 1305 | return 
aggregated_model().create_model( 1306 | model_input=activation, 1307 | vocab_size=vocab_size, 1308 | is_training=is_training, 1309 | **unused_params) 1310 | 1311 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Binary for generating predictions over a set of videos.""" 16 | 17 | import os 18 | import time 19 | 20 | import numpy 21 | import tensorflow as tf 22 | 23 | from tensorflow import app 24 | from tensorflow import flags 25 | from tensorflow import gfile 26 | from tensorflow import logging 27 | 28 | import eval_util 29 | import losses 30 | import readers 31 | import utils 32 | 33 | FLAGS = flags.FLAGS 34 | 35 | if __name__ == '__main__': 36 | flags.DEFINE_string("train_dir", "/tmp/yt8m_model/", 37 | "The directory to load the model files from.") 38 | flags.DEFINE_string("output_file", "", 39 | "The file to save the predictions to.") 40 | flags.DEFINE_string( 41 | "input_data_pattern", "", 42 | "File glob defining the evaluation dataset in tensorflow.SequenceExample " 43 | "format. The SequenceExamples are expected to have an 'rgb' byte array " 44 | "sequence feature as well as a 'labels' int64 context feature.") 45 | 46 | # Model flags. 47 | flags.DEFINE_bool( 48 | "frame_features", False, 49 | "If set, then --eval_data_pattern must be frame-level features. " 50 | "Otherwise, --eval_data_pattern must be aggregated video-level " 51 | "features. The model must also be set appropriately (i.e. to read 3D " 52 | "batches VS 4D batches.") 53 | flags.DEFINE_integer( 54 | "batch_size", 8192, 55 | "How many examples to process per batch.") 56 | flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature " 57 | "to use for training.") 58 | flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.") 59 | 60 | 61 | # Other flags. 62 | flags.DEFINE_integer("num_readers", 4, 63 | "How many threads to use for reading input files.") 64 | flags.DEFINE_integer("top_k", 20, 65 | "How many predictions to output per video.") 66 | flags.DEFINE_integer("check_point",-1, 67 | "Model checkpoint to load, -1 for latest.") 68 | 69 | def format_lines(video_ids, predictions, top_k): 70 | batch_size = len(video_ids) 71 | for video_index in range(batch_size): 72 | top_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:] 73 | line = [(class_index, predictions[video_index][class_index]) 74 | for class_index in top_indices] 75 | line = sorted(line, key=lambda p: -p[1]) 76 | yield video_ids[video_index].decode('utf-8') + "," + " ".join("%i %f" % pair 77 | for pair in line) + "\n" 78 | 79 | 80 | def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1): 81 | """Creates the section of the graph which reads the input data. 
82 | 83 | Args: 84 | reader: A class which parses the input data. 85 | data_pattern: A 'glob' style path to the data files. 86 | batch_size: How many examples to process at a time. 87 | num_readers: How many I/O threads to use. 88 | 89 | Returns: 90 | A tuple containing the features tensor, labels tensor, and optionally a 91 | tensor containing the number of frames per video. The exact dimensions 92 | depend on the reader being used. 93 | 94 | Raises: 95 | IOError: If no files matching the given pattern were found. 96 | """ 97 | with tf.name_scope("input"): 98 | files = gfile.Glob(data_pattern) 99 | if not files: 100 | raise IOError("Unable to find input files. data_pattern='" + 101 | data_pattern + "'") 102 | logging.info("number of input files: " + str(len(files))) 103 | filename_queue = tf.train.string_input_producer( 104 | files, num_epochs=1, shuffle=False) 105 | examples_and_labels = [reader.prepare_reader(filename_queue) 106 | for _ in range(num_readers)] 107 | 108 | video_id_batch, video_batch, unused_labels, num_frames_batch = ( 109 | tf.train.batch_join(examples_and_labels, 110 | batch_size=batch_size, 111 | allow_smaller_final_batch = True, 112 | enqueue_many=True)) 113 | return video_id_batch, video_batch, num_frames_batch 114 | 115 | def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k): 116 | with tf.Session() as sess, gfile.Open(out_file_location, "w+") as out_file: 117 | video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size) 118 | latest_checkpoint = tf.train.latest_checkpoint(train_dir) 119 | if latest_checkpoint is None: 120 | raise Exception("unable to find a checkpoint at location: %s" % train_dir) 121 | else: 122 | if FLAGS.check_point < 0: 123 | meta_graph_location = latest_checkpoint + ".meta" 124 | else: 125 | meta_graph_location = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) + ".meta" 126 | latest_checkpoint = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) 127 | logging.info("loading meta-graph: " + meta_graph_location) 128 | saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True) 129 | logging.info("restoring variables from " + latest_checkpoint) 130 | saver.restore(sess, latest_checkpoint) 131 | input_tensor = tf.get_collection("input_batch_raw")[0] 132 | num_frames_tensor = tf.get_collection("num_frames")[0] 133 | predictions_tensor = tf.get_collection("predictions")[0] 134 | 135 | # Workaround for num_epochs issue. 
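    # (Note: `string_input_producer(..., num_epochs=1)` creates a local
    # "epochs" counter variable, and the restored meta-graph also carries the
    # counters that belonged to the *training* input pipeline, under the
    # "train_input" scope. Those are never fed here, so they must be handled
    # before the queue runners start. An illustrative way to see which
    # variables are involved, assuming a restored graph as above:
    #
    #   for v in tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES):
    #     if "train_input" in v.name:
    #       print(v.name)  # epoch-limit counters from the training graph
    #
    # The helper below assigns those counters and initializes the rest.)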
136 |     def set_up_init_ops(variables):
137 |       init_op_list = []
138 |       for variable in list(variables):
139 |         if "train_input" in variable.name:
140 |           init_op_list.append(tf.assign(variable, 1))
141 |           variables.remove(variable)
142 |       init_op_list.append(tf.variables_initializer(variables))
143 |       return init_op_list
144 |
145 |     sess.run(set_up_init_ops(tf.get_collection_ref(
146 |         tf.GraphKeys.LOCAL_VARIABLES)))
147 |
148 |     coord = tf.train.Coordinator()
149 |     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
150 |     num_examples_processed = 0
151 |     start_time = time.time()
152 |     out_file.write("VideoId,LabelConfidencePairs\n")
153 |
154 |     try:
155 |       while not coord.should_stop():
156 |         video_id_batch_val, video_batch_val, num_frames_batch_val = sess.run([video_id_batch, video_batch, num_frames_batch])
157 |         predictions_val, = sess.run([predictions_tensor], feed_dict={input_tensor: video_batch_val, num_frames_tensor: num_frames_batch_val})
158 |         now = time.time()
159 |         num_examples_processed += len(video_batch_val)
160 |         num_classes = predictions_val.shape[1]
161 |         logging.info("num examples processed: " + str(num_examples_processed) + " elapsed seconds: " + "{0:.2f}".format(now - start_time))
162 |         for line in format_lines(video_id_batch_val, predictions_val, top_k):
163 |           out_file.write(line)
164 |         out_file.flush()
165 |
166 |
167 |     except tf.errors.OutOfRangeError:
168 |       logging.info('Done with inference. The output file was written to ' + out_file_location)
169 |     finally:
170 |       coord.request_stop()
171 |
172 |     coord.join(threads)
173 |     sess.close()
174 |
175 |
176 | def main(unused_argv):
177 |   logging.set_verbosity(tf.logging.INFO)
178 |
179 |   # convert feature_names and feature_sizes to lists of values
180 |   feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
181 |       FLAGS.feature_names, FLAGS.feature_sizes)
182 |
183 |   if FLAGS.frame_features:
184 |     reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
185 |                                             feature_sizes=feature_sizes)
186 |   else:
187 |     reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
188 |                                                  feature_sizes=feature_sizes)
189 |
190 |   if FLAGS.output_file == "":
191 |     raise ValueError("'output_file' was not specified. "
192 |                      "Unable to continue with inference.")
193 |
194 |   if FLAGS.input_data_pattern == "":
195 |     raise ValueError("'input_data_pattern' was not specified. "
196 |                      "Unable to continue with inference.")
197 |
198 |   inference(reader, FLAGS.train_dir, FLAGS.input_data_pattern,
199 |             FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k)
200 |
201 |
202 | if __name__ == "__main__":
203 |   app.run()
204 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Antoine Miech All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
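# (Note: the CrossEntropyLoss below is a class-weighted binary cross entropy.
# Weight alpha applies to the positive term and (1 - alpha) to the negative
# term, scaled by 2 so that alpha = 0.5 recovers the standard loss:
#
#     L = -2 * (alpha * y * log(p + eps) + (1 - alpha) * (1 - y) * log(1 - p + eps))
#
# An illustrative numpy sketch of the same computation, not part of the
# original file:
#
#   import numpy as np
#
#   def weighted_xent(p, y, alpha=0.5, eps=1e-5):
#     per_label = -2 * (alpha * y * np.log(p + eps)
#                       + (1 - alpha) * (1 - y) * np.log(1 - p + eps))
#     return per_label.sum(axis=1).mean()  # sum over classes, mean over batch
#
# The TF implementation below follows the same reduce_sum / reduce_mean order.)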
14 |
15 | """Provides definitions for non-regularized training or test losses."""
16 |
17 | import tensorflow as tf
18 | from tensorflow import flags
19 | import scipy.io as sio
20 | import numpy as np
21 |
22 | FLAGS = flags.FLAGS
23 |
24 | flags.DEFINE_float(
25 |     "alpha", 0.5,
26 |     "Weight on the positive-label term of the weighted cross entropy loss.")
27 |
28 |
29 | class BaseLoss(object):
30 |   """Inherit from this class when implementing new losses."""
31 |
32 |   def calculate_loss(self, unused_predictions, unused_labels, **unused_params):
33 |     """Calculates the average loss of the examples in a mini-batch.
34 |
35 |     Args:
36 |       unused_predictions: a 2-d tensor storing the prediction scores, in which
37 |         each row represents a sample in the mini-batch and each column
38 |         represents a class.
39 |       unused_labels: a 2-d tensor storing the labels, which has the same shape
40 |         as the unused_predictions. The labels must be in the range of 0 and 1.
41 |       unused_params: loss specific parameters.
42 |
43 |     Returns:
44 |       A scalar loss tensor.
45 |     """
46 |     raise NotImplementedError()
47 |
48 |
49 | class CrossEntropyLoss(BaseLoss):
50 |   """Calculate the cross entropy loss between the predictions and labels.
51 |   """
52 |
53 |   def calculate_loss(self, predictions, labels, **unused_params):
54 |     with tf.name_scope("loss_xent"):
55 |       epsilon = 10e-6
56 |       alpha = FLAGS.alpha
57 |
58 |       float_labels = tf.cast(labels, tf.float32)
59 |       cross_entropy_loss = 2 * (alpha * float_labels * tf.log(predictions + epsilon) + (1 - alpha) * (
60 |           1 - float_labels) * tf.log(1 - predictions + epsilon))
61 |       cross_entropy_loss = tf.negative(cross_entropy_loss)
62 |       return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
63 |
--------------------------------------------------------------------------------
/mean_average_precision_calculator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Calculate the mean average precision.
16 |
17 | It provides an interface for calculating mean average precision
18 | for an entire list or the top-n ranked items.
19 |
20 | Example usage:
21 | Call accumulate() repeatedly to process parts of the ranked list. After
22 | processing all the parts, call peek_map_at_n() to compute the mean
23 | average precision.
24 |
25 | ```
26 | import random
27 |
28 | p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])
29 | a = np.array([[random.choice([0, 1]) for _ in xrange(50)]
30 |               for _ in xrange(1000)])
31 |
32 | # mean average precision for 50 classes.
33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
34 |     num_class=50)
35 | calculator.accumulate(p, a)
36 | aps = calculator.peek_map_at_n()
37 | ```
38 | """
39 |
40 | import numpy
41 | import average_precision_calculator
42 |
43 |
44 | class MeanAveragePrecisionCalculator(object):
45 |   """This class computes the mean average precision across classes.
46 |   """
47 |
48 |   def __init__(self, num_class):
49 |     """Construct a calculator to calculate the (macro) average precision.
50 |
51 |     Args:
52 |       num_class: A positive integer, larger than 1, specifying the number of
53 |         classes. One AveragePrecisionCalculator is kept per class.
54 |
55 |     Raises:
56 |       ValueError: An error occurred when num_class is not an integer larger
57 |         than 1.
58 |     """
59 |     if not isinstance(num_class, int) or num_class <= 1:
60 |       raise ValueError("num_class must be an integer larger than 1.")
61 |
62 |     self._ap_calculators = []  # one AveragePrecisionCalculator per class
63 |     self._num_class = num_class  # total number of classes
64 |     for i in range(num_class):
65 |       self._ap_calculators.append(
66 |           average_precision_calculator.AveragePrecisionCalculator())
67 |
68 |   def accumulate(self, predictions, actuals, num_positives=None):
69 |     """Accumulate the predictions and their ground truth labels.
70 |
71 |     Args:
72 |       predictions: A list of lists storing the prediction scores. The outer
73 |         dimension corresponds to classes.
74 |       actuals: A list of lists storing the ground truth labels. The dimensions
75 |         should correspond to the predictions input. Any value
76 |         larger than 0 will be treated as positives, otherwise as negatives.
77 |       num_positives: If provided, it is a list of numbers representing the
78 |         number of true positives for each class. If not provided, the number of
79 |         true positives will be inferred from the 'actuals' array.
80 |
81 |     Raises:
82 |       ValueError: An error occurred when the shape of predictions and actuals
83 |         does not match.
84 |     """
85 |     if not num_positives:
86 |       # One placeholder count per class; iterating over an integer shape
87 |       # value directly would raise a TypeError.
88 |       num_positives = [None] * self._num_class
89 |
90 |     calculators = self._ap_calculators
91 |     for i in range(len(predictions)):
92 |       calculators[i].accumulate(predictions[i], actuals[i], num_positives[i])
93 |
94 |   def clear(self):
95 |     for calculator in self._ap_calculators:
96 |       calculator.clear()
97 |
98 |   def is_empty(self):
99 |     return ([calculator.heap_size for calculator in self._ap_calculators] ==
100 |             [0 for _ in range(self._num_class)])
101 |
102 |   def peek_map_at_n(self):
103 |     """Peek the non-interpolated mean average precision at n.
104 |
105 |     Returns:
106 |       An array of non-interpolated average precision at n (default 0) for each
107 |       class.
108 |     """
109 |     aps = [self._ap_calculators[i].peek_ap_at_n()
110 |            for i in range(self._num_class)]
111 |     return aps
112 |
--------------------------------------------------------------------------------
/model_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Contains a collection of util functions for model construction.
16 | """
17 | import numpy
18 | import tensorflow as tf
19 | from tensorflow import logging
20 | from tensorflow import flags
21 | import tensorflow.contrib.slim as slim
22 |
23 | def SampleRandomSequence(model_input, num_frames, num_samples):
24 |   """Samples a random sequence of frames of size num_samples.
25 |
26 |   Args:
27 |     model_input: A tensor of size batch_size x max_frames x feature_size
28 |     num_frames: A tensor of size batch_size x 1
29 |     num_samples: A scalar
30 |
31 |   Returns:
32 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
33 |   """
34 |
35 |   batch_size = tf.shape(model_input)[0]
36 |   frame_index_offset = tf.tile(
37 |       tf.expand_dims(tf.range(num_samples), 0), [batch_size, 1])
38 |   max_start_frame_index = tf.maximum(num_frames - num_samples, 0)
39 |   start_frame_index = tf.cast(
40 |       tf.multiply(
41 |           tf.random_uniform([batch_size, 1]),
42 |           tf.cast(max_start_frame_index + 1, tf.float32)), tf.int32)
43 |   frame_index = tf.minimum(start_frame_index + frame_index_offset,
44 |                            tf.cast(num_frames - 1, tf.int32))
45 |   batch_index = tf.tile(
46 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
47 |   index = tf.stack([batch_index, frame_index], 2)
48 |   return tf.gather_nd(model_input, index)
49 |
50 |
51 | def SampleRandomFrames(model_input, num_frames, num_samples):
52 |   """Samples a random set of frames of size num_samples.
53 |
54 |   Args:
55 |     model_input: A tensor of size batch_size x max_frames x feature_size
56 |     num_frames: A tensor of size batch_size x 1
57 |     num_samples: A scalar
58 |
59 |   Returns:
60 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
61 |   """
62 |   batch_size = tf.shape(model_input)[0]
63 |   frame_index = tf.cast(
64 |       tf.multiply(
65 |           tf.random_uniform([batch_size, num_samples]),
66 |           tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])), tf.int32)
67 |   batch_index = tf.tile(
68 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
69 |   index = tf.stack([batch_index, frame_index], 2)
70 |   return tf.gather_nd(model_input, index)
71 |
72 | def FramePooling(frames, method, **unused_params):
73 |   """Pools over the frames of a video.
74 |
75 |   Args:
76 |     frames: A tensor with shape [batch_size, num_frames, feature_size].
77 |     method: "average", "max", or "none" (no "attention" branch is implemented).
78 |   Returns:
79 |     A tensor with shape [batch_size, feature_size] for average or max
80 |     pooling. A tensor with shape [batch_size*num_frames, feature_size]
81 |     for none pooling.
82 |
83 |   Raises:
84 |     ValueError: if method is other than "average", "max", or
85 |       "none".
86 |   """
87 |   if method == "average":
88 |     return tf.reduce_mean(frames, 1)
89 |   elif method == "max":
90 |     return tf.reduce_max(frames, 1)
91 |   elif method == "none":
92 |     feature_size = frames.get_shape().as_list()[2]
93 |     return tf.reshape(frames, [-1, feature_size])
94 |   else:
95 |     raise ValueError("Unrecognized pooling method: %s" % method)
96 |
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Contains the base class for models."""
16 |
17 | class BaseModel(object):
18 |   """Inherit from this class when implementing new models."""
19 |
20 |   def create_model(self, unused_model_input, **unused_params):
21 |     raise NotImplementedError()
22 |
--------------------------------------------------------------------------------
/readers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Provides readers configured for different datasets."""
16 |
17 | import tensorflow as tf
18 | import utils
19 |
20 | from tensorflow import logging
21 | def resize_axis(tensor, axis, new_size, fill_value=0):
22 |   """Truncates or pads a tensor to new_size on a given axis.
23 |
24 |   Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
25 |   size increases, the padding will be performed at the end, using fill_value.
26 |
27 |   Args:
28 |     tensor: The tensor to be resized.
29 |     axis: An integer representing the dimension to be sliced.
30 |     new_size: An integer or 0d tensor representing the new value for
31 |       tensor.shape[axis].
32 |     fill_value: Value to use to fill any new entries in the tensor. Will be
33 |       cast to the type of tensor.
34 |
35 |   Returns:
36 |     The resized tensor.
37 |   """
38 |   tensor = tf.convert_to_tensor(tensor)
39 |   shape = tf.unstack(tf.shape(tensor))
40 |
41 |   pad_shape = shape[:]
42 |   pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
43 |
44 |   shape[axis] = tf.minimum(shape[axis], new_size)
45 |   shape = tf.stack(shape)
46 |
47 |   resized = tf.concat([
48 |       tf.slice(tensor, tf.zeros_like(shape), shape),
49 |       tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
50 |   ], axis)
51 |
52 |   # Update shape.
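  # (Illustrative usage sketch, not part of the original file: resize_axis is
  # the pad-or-truncate primitive the frame-level reader uses to force every
  # video to exactly max_frames rows, e.g.
  #
  #   frames = tf.placeholder(tf.float32, [None, 1024])  # variable-length video
  #   fixed = resize_axis(frames, axis=0, new_size=300)  # 300 x 1024, zero-padded
  #
  # The set_shape call below then records the now-static size on the result.)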
53 | new_shape = tensor.get_shape().as_list() # A copy is being made. 54 | new_shape[axis] = new_size 55 | resized.set_shape(new_shape) 56 | return resized 57 | 58 | class BaseReader(object): 59 | """Inherit from this class when implementing new readers.""" 60 | 61 | def prepare_reader(self, unused_filename_queue): 62 | """Create a thread for generating prediction and label tensors.""" 63 | raise NotImplementedError() 64 | 65 | 66 | class YT8MAggregatedFeatureReader(BaseReader): 67 | """Reads TFRecords of pre-aggregated Examples. 68 | 69 | The TFRecords must contain Examples with a sparse int64 'labels' feature and 70 | a fixed length float32 feature, obtained from the features in 'feature_name'. 71 | The float features are assumed to be an average of dequantized values. 72 | """ 73 | 74 | def __init__(self, 75 | num_classes=4716, 76 | feature_sizes=[1024], 77 | feature_names=["mean_inc3"]): 78 | """Construct a YT8MAggregatedFeatureReader. 79 | 80 | Args: 81 | num_classes: a positive integer for the number of classes. 82 | feature_sizes: positive integer(s) for the feature dimensions as a list. 83 | feature_names: the feature name(s) in the tensorflow record as a list. 84 | """ 85 | 86 | assert len(feature_names) == len(feature_sizes), \ 87 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 88 | len(feature_names), len(feature_sizes)) 89 | 90 | self.num_classes = num_classes 91 | self.feature_sizes = feature_sizes 92 | self.feature_names = feature_names 93 | 94 | def prepare_reader(self, filename_queue, batch_size=1024): 95 | """Creates a single reader thread for pre-aggregated YouTube 8M Examples. 96 | 97 | Args: 98 | filename_queue: A tensorflow queue of filename locations. 99 | 100 | Returns: 101 | A tuple of video indexes, features, labels, and padding data. 102 | """ 103 | reader = tf.TFRecordReader() 104 | _, serialized_examples = reader.read_up_to(filename_queue, batch_size) 105 | 106 | tf.add_to_collection("serialized_examples", serialized_examples) 107 | return self.prepare_serialized_examples(serialized_examples) 108 | 109 | def prepare_serialized_examples(self, serialized_examples): 110 | # set the mapping from the fields to data types in the proto 111 | num_features = len(self.feature_names) 112 | assert num_features > 0, "self.feature_names is empty!" 113 | assert len(self.feature_names) == len(self.feature_sizes), \ 114 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 115 | len(self.feature_names), len(self.feature_sizes)) 116 | 117 | feature_map = {"video_id": tf.FixedLenFeature([], tf.string), 118 | "labels": tf.VarLenFeature(tf.int64)} 119 | for feature_index in range(num_features): 120 | feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature( 121 | [self.feature_sizes[feature_index]], tf.float32) 122 | 123 | features = tf.parse_example(serialized_examples, features=feature_map) 124 | 125 | labels = tf.sparse_to_indicator(features["labels"], self.num_classes) 126 | labels.set_shape([None, self.num_classes]) 127 | concatenated_features = tf.concat([ 128 | features[feature_name] for feature_name in self.feature_names], 1) 129 | 130 | return features["video_id"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]]) 131 | 132 | class YT8MFrameFeatureReader(BaseReader): 133 | """Reads TFRecords of SequenceExamples. 
134 | 135 | The TFRecords must contain SequenceExamples with the sparse in64 'labels' 136 | context feature and a fixed length byte-quantized feature vector, obtained 137 | from the features in 'feature_names'. The quantized features will be mapped 138 | back into a range between min_quantized_value and max_quantized_value. 139 | """ 140 | 141 | def __init__(self, 142 | num_classes=4716, 143 | feature_sizes=[1024], 144 | feature_names=["inc3"], 145 | max_frames=300): 146 | """Construct a YT8MFrameFeatureReader. 147 | 148 | Args: 149 | num_classes: a positive integer for the number of classes. 150 | feature_sizes: positive integer(s) for the feature dimensions as a list. 151 | feature_names: the feature name(s) in the tensorflow record as a list. 152 | max_frames: the maximum number of frames to process. 153 | """ 154 | 155 | assert len(feature_names) == len(feature_sizes), \ 156 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 157 | len(feature_names), len(feature_sizes)) 158 | 159 | self.num_classes = num_classes 160 | self.feature_sizes = feature_sizes 161 | self.feature_names = feature_names 162 | self.max_frames = max_frames 163 | 164 | def get_video_matrix(self, 165 | features, 166 | feature_size, 167 | max_frames, 168 | max_quantized_value, 169 | min_quantized_value): 170 | """Decodes features from an input string and quantizes it. 171 | 172 | Args: 173 | features: raw feature values 174 | feature_size: length of each frame feature vector 175 | max_frames: number of frames (rows) in the output feature_matrix 176 | max_quantized_value: the maximum of the quantized value. 177 | min_quantized_value: the minimum of the quantized value. 178 | 179 | Returns: 180 | feature_matrix: matrix of all frame-features 181 | num_frames: number of frames in the sequence 182 | """ 183 | decoded_features = tf.reshape( 184 | tf.cast(tf.decode_raw(features, tf.uint8), tf.float32), 185 | [-1, feature_size]) 186 | 187 | num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames) 188 | feature_matrix = utils.Dequantize(decoded_features, 189 | max_quantized_value, 190 | min_quantized_value) 191 | feature_matrix = resize_axis(feature_matrix, 0, max_frames) 192 | return feature_matrix, num_frames 193 | 194 | def prepare_reader(self, 195 | filename_queue, 196 | max_quantized_value=2, 197 | min_quantized_value=-2): 198 | """Creates a single reader thread for YouTube8M SequenceExamples. 199 | 200 | Args: 201 | filename_queue: A tensorflow queue of filename locations. 202 | max_quantized_value: the maximum of the quantized value. 203 | min_quantized_value: the minimum of the quantized value. 204 | 205 | Returns: 206 | A tuple of video indexes, video features, labels, and padding data. 
207 | """ 208 | reader = tf.TFRecordReader() 209 | _, serialized_example = reader.read(filename_queue) 210 | 211 | return self.prepare_serialized_examples(serialized_example, 212 | max_quantized_value, min_quantized_value) 213 | 214 | def prepare_serialized_examples(self, serialized_example, 215 | max_quantized_value=2, min_quantized_value=-2): 216 | 217 | contexts, features = tf.parse_single_sequence_example( 218 | serialized_example, 219 | context_features={"video_id": tf.FixedLenFeature( 220 | [], tf.string), 221 | "labels": tf.VarLenFeature(tf.int64)}, 222 | sequence_features={ 223 | feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string) 224 | for feature_name in self.feature_names 225 | }) 226 | 227 | # read ground truth labels 228 | labels = (tf.cast( 229 | tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1, 230 | validate_indices=False), 231 | tf.bool)) 232 | 233 | # loads (potentially) different types of features and concatenates them 234 | num_features = len(self.feature_names) 235 | assert num_features > 0, "No feature selected: feature_names is empty!" 236 | 237 | assert len(self.feature_names) == len(self.feature_sizes), \ 238 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 239 | len(self.feature_names), len(self.feature_sizes)) 240 | 241 | num_frames = -1 # the number of frames in the video 242 | feature_matrices = [None] * num_features # an array of different features 243 | for feature_index in range(num_features): 244 | feature_matrix, num_frames_in_this_feature = self.get_video_matrix( 245 | features[self.feature_names[feature_index]], 246 | self.feature_sizes[feature_index], 247 | self.max_frames, 248 | max_quantized_value, 249 | min_quantized_value) 250 | if num_frames == -1: 251 | num_frames = num_frames_in_this_feature 252 | else: 253 | tf.assert_equal(num_frames, num_frames_in_this_feature) 254 | 255 | feature_matrices[feature_index] = feature_matrix 256 | 257 | # cap the number of frames at self.max_frames 258 | num_frames = tf.minimum(num_frames, self.max_frames) 259 | 260 | # concatenate different features 261 | video_matrix = tf.concat(feature_matrices, 1) 262 | 263 | # convert to batch format. 264 | # TODO: Do proper batch reads to remove the IO bottleneck. 265 | batch_video_ids = tf.expand_dims(contexts["video_id"], 0) 266 | batch_video_matrix = tf.expand_dims(video_matrix, 0) 267 | batch_labels = tf.expand_dims(labels, 0) 268 | batch_frames = tf.expand_dims(num_frames, 0) 269 | 270 | return batch_video_ids, batch_video_matrix, batch_labels, batch_frames 271 | 272 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
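# (Illustrative invocation, not part of the original file; the data path and
# model name are placeholders, while the flag names are the ones defined
# below:
#
#   python train.py \
#     --train_data_pattern="gs://some_bucket/train*.tfrecord" \
#     --frame_features=True \
#     --feature_names="rgb,audio" --feature_sizes="1024,128" \
#     --model=SomeFrameLevelModel \
#     --train_dir=/tmp/yt8m_model \
#     --batch_size=128 --base_learning_rate=0.0002
# )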
14 | """Binary for training Tensorflow models on the YouTube-8M dataset.""" 15 | 16 | import json 17 | import os 18 | import time 19 | 20 | import eval_util 21 | import export_model 22 | import losses 23 | import frame_level_models 24 | import video_level_models 25 | import readers 26 | import tensorflow as tf 27 | import tensorflow.contrib.slim as slim 28 | from tensorflow import app 29 | from tensorflow import flags 30 | from tensorflow import gfile 31 | from tensorflow import logging 32 | import utils 33 | 34 | FLAGS = flags.FLAGS 35 | 36 | if __name__ == "__main__": 37 | # Dataset flags. 38 | flags.DEFINE_string("train_dir", "/tmp/yt8m_model/", 39 | "The directory to save the model files in.") 40 | flags.DEFINE_string( 41 | "train_data_pattern", "", 42 | "File glob for the training dataset. If the files refer to Frame Level " 43 | "features (i.e. tensorflow.SequenceExample), then set --reader_type " 44 | "format. The (Sequence)Examples are expected to have 'rgb' byte array " 45 | "sequence feature as well as a 'labels' int64 context feature.") 46 | flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature " 47 | "to use for training.") 48 | flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.") 49 | 50 | # Model flags. 51 | flags.DEFINE_bool( 52 | "frame_features", False, 53 | "If set, then --train_data_pattern must be frame-level features. " 54 | "Otherwise, --train_data_pattern must be aggregated video-level " 55 | "features. The model must also be set appropriately (i.e. to read 3D " 56 | "batches VS 4D batches.") 57 | flags.DEFINE_string( 58 | "model", "LogisticModel", 59 | "Which architecture to use for the model. Models are defined " 60 | "in models.py.") 61 | flags.DEFINE_bool( 62 | "start_new_model", False, 63 | "If set, this will not resume from a checkpoint and will instead create a" 64 | " new model instance.") 65 | 66 | # Training flags. 67 | flags.DEFINE_integer("batch_size", 1024, 68 | "How many examples to process per batch for training.") 69 | flags.DEFINE_string("label_loss", "CrossEntropyLoss", 70 | "Which loss function to use for training the model.") 71 | flags.DEFINE_float( 72 | "regularization_penalty", 1, 73 | "How much weight to give to the regularization loss (the label loss has " 74 | "a weight of 1).") 75 | flags.DEFINE_float("base_learning_rate", 0.001, 76 | "Which learning rate to start with.") 77 | flags.DEFINE_float("learning_rate_decay", 0.9, 78 | "Learning rate decay factor to be applied every " 79 | "learning_rate_decay_examples.") 80 | flags.DEFINE_float("learning_rate_decay_examples", 4000000, 81 | "Multiply current learning rate by learning_rate_decay " 82 | "every learning_rate_decay_examples.") 83 | flags.DEFINE_integer("num_epochs", 15, 84 | "How many passes to make over the dataset before " 85 | "halting training.") 86 | flags.DEFINE_integer("max_steps", None, 87 | "The maximum number of iterations of the training loop.") 88 | flags.DEFINE_integer("export_model_steps", 10000, 89 | "The period, in number of steps, with which the model " 90 | "is exported for batch prediction.") 91 | 92 | # Other flags. 
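# (Note on the learning-rate flags above: build_graph uses staircase
# exponential decay, so after s training steps the learning rate is
#
#   lr(s) = base_learning_rate * learning_rate_decay ** floor(s * batch_size / learning_rate_decay_examples)
#
# With the defaults (0.001 base, 0.9 decay, 4,000,000 examples, batch 1024),
# the first drop to 0.0009 happens after 4000000 / 1024, i.e. about
# 3907 steps.)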
93 | flags.DEFINE_integer("num_readers", 8, 94 | "How many threads to use for reading input files.") 95 | flags.DEFINE_string("optimizer", "AdamOptimizer", 96 | "What optimizer class to use.") 97 | flags.DEFINE_float("clip_gradient_norm", 1.0, "Norm to clip gradients to.") 98 | flags.DEFINE_bool( 99 | "log_device_placement", False, 100 | "Whether to write the device on which every op will run into the " 101 | "logs on startup.") 102 | 103 | def validate_class_name(flag_value, category, modules, expected_superclass): 104 | """Checks that the given string matches a class of the expected type. 105 | 106 | Args: 107 | flag_value: A string naming the class to instantiate. 108 | category: A string used further describe the class in error messages 109 | (e.g. 'model', 'reader', 'loss'). 110 | modules: A list of modules to search for the given class. 111 | expected_superclass: A class that the given class should inherit from. 112 | 113 | Raises: 114 | FlagsError: If the given class could not be found or if the first class 115 | found with that name doesn't inherit from the expected superclass. 116 | 117 | Returns: 118 | True if a class was found that matches the given constraints. 119 | """ 120 | candidates = [getattr(module, flag_value, None) for module in modules] 121 | for candidate in candidates: 122 | if not candidate: 123 | continue 124 | if not issubclass(candidate, expected_superclass): 125 | raise flags.FlagsError("%s '%s' doesn't inherit from %s." % 126 | (category, flag_value, 127 | expected_superclass.__name__)) 128 | return True 129 | raise flags.FlagsError("Unable to find %s '%s'." % (category, flag_value)) 130 | 131 | def get_input_data_tensors(reader, 132 | data_pattern, 133 | batch_size=1000, 134 | num_epochs=None, 135 | num_readers=1): 136 | """Creates the section of the graph which reads the training data. 137 | 138 | Args: 139 | reader: A class which parses the training data. 140 | data_pattern: A 'glob' style path to the data files. 141 | batch_size: How many examples to process at a time. 142 | num_epochs: How many passes to make over the training data. Set to 'None' 143 | to run indefinitely. 144 | num_readers: How many I/O threads to use. 145 | 146 | Returns: 147 | A tuple containing the features tensor, labels tensor, and optionally a 148 | tensor containing the number of frames per video. The exact dimensions 149 | depend on the reader being used. 150 | 151 | Raises: 152 | IOError: If no files matching the given pattern were found. 153 | """ 154 | logging.info("Using batch size of " + str(batch_size) + " for training.") 155 | with tf.name_scope("train_input"): 156 | files = gfile.Glob(data_pattern) 157 | if not files: 158 | raise IOError("Unable to find training files. 
data_pattern='" + 159 | data_pattern + "'.") 160 | logging.info("Number of training files: %s.", str(len(files))) 161 | filename_queue = tf.train.string_input_producer( 162 | files, num_epochs=num_epochs, shuffle=True) 163 | training_data = [ 164 | reader.prepare_reader(filename_queue) for _ in range(num_readers) 165 | ] 166 | 167 | return tf.train.shuffle_batch_join( 168 | training_data, 169 | batch_size=batch_size, 170 | capacity=FLAGS.batch_size * 5, 171 | min_after_dequeue=FLAGS.batch_size, 172 | allow_smaller_final_batch=True, 173 | enqueue_many=True) 174 | 175 | 176 | def find_class_by_name(name, modules): 177 | """Searches the provided modules for the named class and returns it.""" 178 | modules = [getattr(module, name, None) for module in modules] 179 | return next(a for a in modules if a) 180 | 181 | 182 | def build_graph(reader, 183 | model, 184 | train_data_pattern, 185 | label_loss_fn=losses.CrossEntropyLoss(), 186 | batch_size=1000, 187 | base_learning_rate=0.01, 188 | learning_rate_decay_examples=1000000, 189 | learning_rate_decay=0.95, 190 | optimizer_class=tf.train.AdamOptimizer, 191 | clip_gradient_norm=1.0, 192 | regularization_penalty=1, 193 | num_readers=1, 194 | num_epochs=None): 195 | """Creates the Tensorflow graph. 196 | 197 | This will only be called once in the life of 198 | a training model, because after the graph is created the model will be 199 | restored from a meta graph file rather than being recreated. 200 | 201 | Args: 202 | reader: The data file reader. It should inherit from BaseReader. 203 | model: The core model (e.g. logistic or neural net). It should inherit 204 | from BaseModel. 205 | train_data_pattern: glob path to the training data files. 206 | label_loss_fn: What kind of loss to apply to the model. It should inherit 207 | from BaseLoss. 208 | batch_size: How many examples to process at a time. 209 | base_learning_rate: What learning rate to initialize the optimizer with. 210 | optimizer_class: Which optimization algorithm to use. 211 | clip_gradient_norm: Magnitude of the gradient to clip to. 212 | regularization_penalty: How much weight to give the regularization loss 213 | compared to the label loss. 214 | num_readers: How many threads to use for I/O operations. 215 | num_epochs: How many passes to make over the data. 'None' means an 216 | unlimited number of passes. 
217 | """ 218 | 219 | global_step = tf.Variable(0, trainable=False, name="global_step") 220 | 221 | learning_rate = tf.train.exponential_decay( 222 | base_learning_rate, 223 | global_step * batch_size, 224 | learning_rate_decay_examples, 225 | learning_rate_decay, 226 | staircase=True) 227 | tf.summary.scalar('learning_rate', learning_rate) 228 | 229 | optimizer = optimizer_class(learning_rate) 230 | unused_video_id, model_input_raw, labels_batch, num_frames = ( 231 | get_input_data_tensors( 232 | reader, 233 | train_data_pattern, 234 | batch_size=batch_size, 235 | num_readers=num_readers, 236 | num_epochs=num_epochs)) 237 | tf.summary.histogram("model/input_raw", model_input_raw) 238 | 239 | feature_dim = len(model_input_raw.get_shape()) - 1 240 | 241 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 242 | 243 | with tf.name_scope("model"): 244 | result = model.create_model( 245 | model_input, 246 | num_frames=num_frames, 247 | vocab_size=reader.num_classes, 248 | labels=labels_batch) 249 | 250 | for variable in slim.get_model_variables(): 251 | tf.summary.histogram(variable.op.name, variable) 252 | 253 | predictions = result["predictions"] 254 | if "loss" in result.keys(): 255 | label_loss = result["loss"] 256 | else: 257 | label_loss = label_loss_fn.calculate_loss(predictions, labels_batch) 258 | tf.summary.scalar("label_loss", label_loss) 259 | 260 | if "regularization_loss" in result.keys(): 261 | reg_loss = result["regularization_loss"] 262 | else: 263 | reg_loss = tf.constant(0.0) 264 | 265 | reg_losses = tf.losses.get_regularization_losses() 266 | if reg_losses: 267 | reg_loss += tf.add_n(reg_losses) 268 | 269 | if regularization_penalty != 0: 270 | tf.summary.scalar("reg_loss", reg_loss) 271 | 272 | # Adds update_ops (e.g., moving average updates in batch normalization) as 273 | # a dependency to the train_op. 274 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 275 | if "update_ops" in result.keys(): 276 | update_ops += result["update_ops"] 277 | if update_ops: 278 | with tf.control_dependencies(update_ops): 279 | barrier = tf.no_op(name="gradient_barrier") 280 | with tf.control_dependencies([barrier]): 281 | label_loss = tf.identity(label_loss) 282 | 283 | # Incorporate the L2 weight penalties etc. 284 | final_loss = regularization_penalty * reg_loss + label_loss 285 | train_op = slim.learning.create_train_op( 286 | final_loss, 287 | optimizer, 288 | global_step=global_step, 289 | clip_gradient_norm=clip_gradient_norm) 290 | 291 | tf.add_to_collection("global_step", global_step) 292 | tf.add_to_collection("loss", label_loss) 293 | tf.add_to_collection("predictions", predictions) 294 | tf.add_to_collection("input_batch_raw", model_input_raw) 295 | tf.add_to_collection("input_batch", model_input) 296 | tf.add_to_collection("num_frames", num_frames) 297 | tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) 298 | tf.add_to_collection("train_op", train_op) 299 | 300 | 301 | class Trainer(object): 302 | """A Trainer to train a Tensorflow graph.""" 303 | 304 | def __init__(self, cluster, task, train_dir, model, reader, model_exporter, 305 | log_device_placement=True, max_steps=None, 306 | export_model_steps=1000): 307 | """"Creates a Trainer. 308 | 309 | Args: 310 | cluster: A tf.train.ClusterSpec if the execution is distributed. 311 | None otherwise. 312 | task: A TaskSpec describing the job type and the task index. 
313 |     """
314 |
315 |     self.cluster = cluster
316 |     self.task = task
317 |     self.is_master = (task.type == "master" and task.index == 0)
318 |     self.train_dir = train_dir
319 |     self.config = tf.ConfigProto(log_device_placement=log_device_placement)
320 |     self.model = model
321 |     self.reader = reader
322 |     self.model_exporter = model_exporter
323 |     self.max_steps = max_steps
324 |     self.max_steps_reached = False
325 |     self.export_model_steps = export_model_steps
326 |     self.last_model_export_step = 0
327 |
328 |     if self.task.type == "master" and self.task.index > 0:
329 |       raise StandardError("%s: Only one replica of master expected" %
330 |                           task_as_string(self.task))
331 |
332 |   def run(self, start_new_model=False):
333 |     """Performs training on the currently defined Tensorflow graph.
334 |
335 |     Runs until the data is exhausted, `max_steps` is reached, or the
336 |     supervisor requests a stop; it does not return a value.
337 |     """
338 |     if self.is_master and start_new_model:
339 |       self.remove_training_directory(self.train_dir)
340 |
341 |     target, device_fn = self.start_server_if_distributed()
342 |
343 |     meta_filename = self.get_meta_filename(start_new_model, self.train_dir)
344 |
345 |     with tf.Graph().as_default() as graph:
346 |
347 |       if meta_filename:
348 |         saver = self.recover_model(meta_filename)
349 |
350 |       with tf.device(device_fn):
351 |
352 |         if not meta_filename:
353 |           saver = self.build_model(self.model, self.reader)
354 |
355 |         global_step = tf.get_collection("global_step")[0]
356 |         loss = tf.get_collection("loss")[0]
357 |         predictions = tf.get_collection("predictions")[0]
358 |         labels = tf.get_collection("labels")[0]
359 |         train_op = tf.get_collection("train_op")[0]
360 |         init_op = tf.global_variables_initializer()
361 |
362 |       sv = tf.train.Supervisor(
363 |           graph,
364 |           logdir=self.train_dir,
365 |           init_op=init_op,
366 |           is_chief=self.is_master,
367 |           global_step=global_step,
368 |           save_model_secs=15 * 60,
369 |           save_summaries_secs=120,
370 |           saver=saver)
371 |
372 |       logging.info("%s: Starting managed session.", task_as_string(self.task))
373 |       with sv.managed_session(target, config=self.config) as sess:
374 |
375 |         try:
376 |           logging.info("%s: Entering training loop.", task_as_string(self.task))
377 |           while (not sv.should_stop()) and (not self.max_steps_reached):
378 |
379 |             batch_start_time = time.time()
380 |             _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
381 |                 [train_op, global_step, loss, predictions, labels])
382 |             seconds_per_batch = time.time() - batch_start_time
383 |
384 |             if self.max_steps and self.max_steps <= global_step_val:
385 |               self.max_steps_reached = True
386 |
387 |             if self.is_master:
388 |               examples_per_second = labels_val.shape[0] / seconds_per_batch
389 |               hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
390 |                                                           labels_val)
391 |               perr = eval_util.calculate_precision_at_equal_recall_rate(
392 |                   predictions_val, labels_val)
393 |               gap = eval_util.calculate_gap(predictions_val, labels_val)
394 |
395 |               logging.info(
396 |                   "%s: training step " + str(global_step_val) + "| Hit@1: " +
397 |                   ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " +
398 |                   ("%.2f" % gap) + " Loss: " + str(loss_val),
399 |                   task_as_string(self.task))
400 |
401 |               sv.summary_writer.add_summary(
402 |                   utils.MakeSummary("model/Training_Hit@1", hit_at_one),
403 |                   global_step_val)
404 |               sv.summary_writer.add_summary(
405 |                   utils.MakeSummary("model/Training_Perr", perr), global_step_val)
406 |               sv.summary_writer.add_summary(
407 |                   utils.MakeSummary("model/Training_GAP", gap), global_step_val)
408 |               sv.summary_writer.add_summary(
409 | utils.MakeSummary("global_step/Examples/Second", 410 | examples_per_second), global_step_val) 411 | sv.summary_writer.flush() 412 | 413 | # Exporting the model every x steps 414 | time_to_export = ((self.last_model_export_step == 0) or 415 | (global_step_val - self.last_model_export_step 416 | >= self.export_model_steps)) 417 | 418 | if self.is_master and time_to_export: 419 | self.export_model(global_step_val, sv.saver, sv.save_path, sess) 420 | self.last_model_export_step = global_step_val 421 | 422 | # Exporting the final model 423 | if self.is_master: 424 | self.export_model(global_step_val, sv.saver, sv.save_path, sess) 425 | 426 | except tf.errors.OutOfRangeError: 427 | logging.info("%s: Done training -- epoch limit reached.", 428 | task_as_string(self.task)) 429 | 430 | logging.info("%s: Exited training loop.", task_as_string(self.task)) 431 | sv.Stop() 432 | 433 | def export_model(self, global_step_val, saver, save_path, session): 434 | 435 | # If the model has already been exported at this step, return. 436 | if global_step_val == self.last_model_export_step: 437 | return 438 | 439 | last_checkpoint = saver.save(session, save_path, global_step_val) 440 | 441 | model_dir = "{0}/export/step_{1}".format(self.train_dir, global_step_val) 442 | logging.info("%s: Exporting the model at step %s to %s.", 443 | task_as_string(self.task), global_step_val, model_dir) 444 | 445 | self.model_exporter.export_model( 446 | model_dir=model_dir, 447 | global_step_val=global_step_val, 448 | last_checkpoint=last_checkpoint) 449 | 450 | 451 | def start_server_if_distributed(self): 452 | """Starts a server if the execution is distributed.""" 453 | 454 | if self.cluster: 455 | logging.info("%s: Starting trainer within cluster %s.", 456 | task_as_string(self.task), self.cluster.as_dict()) 457 | server = start_server(self.cluster, self.task) 458 | target = server.target 459 | device_fn = tf.train.replica_device_setter( 460 | ps_device="/job:ps", 461 | worker_device="/job:%s/task:%d" % (self.task.type, self.task.index), 462 | cluster=self.cluster) 463 | else: 464 | target = "" 465 | device_fn = "" 466 | return (target, device_fn) 467 | 468 | def remove_training_directory(self, train_dir): 469 | """Removes the training directory.""" 470 | try: 471 | logging.info( 472 | "%s: Removing existing train directory.", 473 | task_as_string(self.task)) 474 | gfile.DeleteRecursively(train_dir) 475 | except: 476 | logging.error( 477 | "%s: Failed to delete directory " + train_dir + 478 | " when starting a new model. Please delete it manually and" + 479 | " try again.", task_as_string(self.task)) 480 | 481 | def get_meta_filename(self, start_new_model, train_dir): 482 | if start_new_model: 483 | logging.info("%s: Flag 'start_new_model' is set. Building a new model.", 484 | task_as_string(self.task)) 485 | return None 486 | 487 | latest_checkpoint = tf.train.latest_checkpoint(train_dir) 488 | if not latest_checkpoint: 489 | logging.info("%s: No checkpoint file found. Building a new model.", 490 | task_as_string(self.task)) 491 | return None 492 | 493 | meta_filename = latest_checkpoint + ".meta" 494 | if not gfile.Exists(meta_filename): 495 | logging.info("%s: No meta graph file found. 
Building a new model.", 496 | task_as_string(self.task)) 497 | return None 498 | else: 499 | return meta_filename 500 | 501 | def recover_model(self, meta_filename): 502 | logging.info("%s: Restoring from meta graph file %s", 503 | task_as_string(self.task), meta_filename) 504 | return tf.train.import_meta_graph(meta_filename) 505 | 506 | def build_model(self, model, reader): 507 | """Find the model and build the graph.""" 508 | 509 | label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])() 510 | optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train]) 511 | 512 | build_graph(reader=reader, 513 | model=model, 514 | optimizer_class=optimizer_class, 515 | clip_gradient_norm=FLAGS.clip_gradient_norm, 516 | train_data_pattern=FLAGS.train_data_pattern, 517 | label_loss_fn=label_loss_fn, 518 | base_learning_rate=FLAGS.base_learning_rate, 519 | learning_rate_decay=FLAGS.learning_rate_decay, 520 | learning_rate_decay_examples=FLAGS.learning_rate_decay_examples, 521 | regularization_penalty=FLAGS.regularization_penalty, 522 | num_readers=FLAGS.num_readers, 523 | batch_size=FLAGS.batch_size, 524 | num_epochs=FLAGS.num_epochs) 525 | 526 | return tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=5) 527 | 528 | 529 | def get_reader(): 530 | # Convert feature_names and feature_sizes to lists of values. 531 | feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes( 532 | FLAGS.feature_names, FLAGS.feature_sizes) 533 | 534 | if FLAGS.frame_features: 535 | reader = readers.YT8MFrameFeatureReader( 536 | feature_names=feature_names, feature_sizes=feature_sizes) 537 | else: 538 | reader = readers.YT8MAggregatedFeatureReader( 539 | feature_names=feature_names, feature_sizes=feature_sizes) 540 | 541 | return reader 542 | 543 | 544 | class ParameterServer(object): 545 | """A parameter server to serve variables in a distributed execution.""" 546 | 547 | def __init__(self, cluster, task): 548 | """Creates a ParameterServer. 549 | 550 | Args: 551 | cluster: A tf.train.ClusterSpec if the execution is distributed. 552 | None otherwise. 553 | task: A TaskSpec describing the job type and the task index. 554 | """ 555 | 556 | self.cluster = cluster 557 | self.task = task 558 | 559 | def run(self): 560 | """Starts the parameter server.""" 561 | 562 | logging.info("%s: Starting parameter server within cluster %s.", 563 | task_as_string(self.task), self.cluster.as_dict()) 564 | server = start_server(self.cluster, self.task) 565 | server.join() 566 | 567 | 568 | def start_server(cluster, task): 569 | """Creates a Server. 570 | 571 | Args: 572 | cluster: A tf.train.ClusterSpec if the execution is distributed. 573 | None otherwise. 574 | task: A TaskSpec describing the job type and the task index. 575 | """ 576 | 577 | if not task.type: 578 | raise ValueError("%s: The task type must be specified." % 579 | task_as_string(task)) 580 | if task.index is None: 581 | raise ValueError("%s: The task index must be specified." % 582 | task_as_string(task)) 583 | 584 | # Create and start a server. 585 | return tf.train.Server( 586 | tf.train.ClusterSpec(cluster), 587 | protocol="grpc", 588 | job_name=task.type, 589 | task_index=task.index) 590 | 591 | def task_as_string(task): 592 | return "/job:%s/task:%s" % (task.type, task.index) 593 | 594 | def main(unused_argv): 595 | # Load the environment. 596 | env = json.loads(os.environ.get("TF_CONFIG", "{}")) 597 | 598 | # Load the cluster data from the environment. 
599 | cluster_data = env.get("cluster", None) 600 | cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None 601 | 602 | # Load the task data from the environment. 603 | task_data = env.get("task", None) or {"type": "master", "index": 0} 604 | task = type("TaskSpec", (object,), task_data) 605 | 606 | # Logging the version. 607 | logging.set_verbosity(tf.logging.INFO) 608 | logging.info("%s: Tensorflow version: %s.", 609 | task_as_string(task), tf.__version__) 610 | 611 | # Dispatch to a master, a worker, or a parameter server. 612 | if not cluster or task.type == "master" or task.type == "worker": 613 | 614 | model = find_class_by_name(FLAGS.model, 615 | [frame_level_models, video_level_models])() 616 | 617 | reader = get_reader() 618 | 619 | model_exporter = export_model.ModelExporter( 620 | frame_features=FLAGS.frame_features, 621 | model=model, 622 | reader=reader) 623 | 624 | Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter, 625 | FLAGS.log_device_placement, FLAGS.max_steps, 626 | FLAGS.export_model_steps).run(start_new_model=FLAGS.start_new_model) 627 | 628 | elif task.type == "ps": 629 | 630 | ParameterServer(cluster, task).run() 631 | 632 | else: 633 | 634 | raise ValueError("%s: Invalid task_type: %s." % 635 | (task_as_string(task), task.type)) 636 | 637 | if __name__ == "__main__": 638 | app.run() 639 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains a collection of util functions for training and evaluating. 16 | """ 17 | 18 | import numpy 19 | import tensorflow as tf 20 | from tensorflow import logging 21 | 22 | 23 | def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2): 24 | """Dequantize the feature from the byte format to the float format. 25 | 26 | Args: 27 | feat_vector: the input 1-d vector. 28 | max_quantized_value: the maximum of the quantized value. 29 | min_quantized_value: the minimum of the quantized value. 30 | 31 | Returns: 32 | A float vector which has the same shape as feat_vector. 33 | """ 34 | assert max_quantized_value > min_quantized_value 35 | quantized_range = max_quantized_value - min_quantized_value 36 | scalar = quantized_range / 255.0 37 | bias = (quantized_range / 512.0) + min_quantized_value 38 | return feat_vector * scalar + bias 39 | 40 | 41 | def MakeSummary(name, value): 42 | """Creates a tf.Summary proto with the given name and value.""" 43 | summary = tf.Summary() 44 | val = summary.value.add() 45 | val.tag = str(name) 46 | val.simple_value = float(value) 47 | return summary 48 | 49 | 50 | def AddGlobalStepSummary(summary_writer, 51 | global_step_val, 52 | global_step_info_dict, 53 | summary_scope="Eval"): 54 | """Add the global_step summary to the Tensorboard. 
55 | 56 | Args: 57 | summary_writer: Tensorflow summary_writer. 58 | global_step_val: a int value of the global step. 59 | global_step_info_dict: a dictionary of the evaluation metrics calculated for 60 | a mini-batch. 61 | summary_scope: Train or Eval. 62 | 63 | Returns: 64 | A string of this global_step summary 65 | """ 66 | this_hit_at_one = global_step_info_dict["hit_at_one"] 67 | this_perr = global_step_info_dict["perr"] 68 | this_loss = global_step_info_dict["loss"] 69 | examples_per_second = global_step_info_dict.get("examples_per_second", -1) 70 | 71 | summary_writer.add_summary( 72 | MakeSummary("GlobalStep/" + summary_scope + "_Hit@1", this_hit_at_one), 73 | global_step_val) 74 | summary_writer.add_summary( 75 | MakeSummary("GlobalStep/" + summary_scope + "_Perr", this_perr), 76 | global_step_val) 77 | summary_writer.add_summary( 78 | MakeSummary("GlobalStep/" + summary_scope + "_Loss", this_loss), 79 | global_step_val) 80 | 81 | if examples_per_second != -1: 82 | summary_writer.add_summary( 83 | MakeSummary("GlobalStep/" + summary_scope + "_Example_Second", 84 | examples_per_second), global_step_val) 85 | 86 | summary_writer.flush() 87 | info = ("global_step {0} | Batch Hit@1: {1:.3f} | Batch PERR: {2:.3f} | Batch Loss: {3:.3f} " 88 | "| Examples_per_sec: {4:.3f}").format( 89 | global_step_val, this_hit_at_one, this_perr, this_loss, 90 | examples_per_second) 91 | return info 92 | 93 | 94 | def AddEpochSummary(summary_writer, 95 | global_step_val, 96 | epoch_info_dict, 97 | summary_scope="Eval"): 98 | """Add the epoch summary to the Tensorboard. 99 | 100 | Args: 101 | summary_writer: Tensorflow summary_writer. 102 | global_step_val: a int value of the global step. 103 | epoch_info_dict: a dictionary of the evaluation metrics calculated for the 104 | whole epoch. 105 | summary_scope: Train or Eval. 106 | 107 | Returns: 108 | A string of this global_step summary 109 | """ 110 | epoch_id = epoch_info_dict["epoch_id"] 111 | avg_hit_at_one = epoch_info_dict["avg_hit_at_one"] 112 | avg_perr = epoch_info_dict["avg_perr"] 113 | avg_loss = epoch_info_dict["avg_loss"] 114 | aps = epoch_info_dict["aps"] 115 | gap = epoch_info_dict["gap"] 116 | mean_ap = numpy.mean(aps) 117 | 118 | summary_writer.add_summary( 119 | MakeSummary("Epoch/" + summary_scope + "_Avg_Hit@1", avg_hit_at_one), 120 | global_step_val) 121 | summary_writer.add_summary( 122 | MakeSummary("Epoch/" + summary_scope + "_Avg_Perr", avg_perr), 123 | global_step_val) 124 | summary_writer.add_summary( 125 | MakeSummary("Epoch/" + summary_scope + "_Avg_Loss", avg_loss), 126 | global_step_val) 127 | summary_writer.add_summary( 128 | MakeSummary("Epoch/" + summary_scope + "_MAP", mean_ap), 129 | global_step_val) 130 | summary_writer.add_summary( 131 | MakeSummary("Epoch/" + summary_scope + "_GAP", gap), 132 | global_step_val) 133 | summary_writer.flush() 134 | 135 | info = ("epoch/eval number {0} | Avg_Hit@1: {1:.3f} | Avg_PERR: {2:.3f} " 136 | "| MAP: {3:.3f} | GAP: {4:.4f} | Avg_Loss: {5:3f}").format( 137 | epoch_id, avg_hit_at_one, avg_perr, mean_ap, gap, avg_loss) 138 | return info 139 | 140 | def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes): 141 | """Extract the list of feature names and the dimensionality of each feature 142 | from string of comma separated values. 
--------------------------------------------------------------------------------
/video_level_models.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains model definitions."""
import math

import models
import tensorflow as tf
import utils

from tensorflow import flags
import tensorflow.contrib.slim as slim

FLAGS = flags.FLAGS
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
flags.DEFINE_float(
    "moe_l2", 1e-8,
    "L2 penalty for MoeModel.")
flags.DEFINE_integer(
    "moe_low_rank_gating", -1,
    "Rank of the low-rank factorization of the MoeModel gates "
    "(-1 disables the factorization).")
flags.DEFINE_bool(
    "moe_prob_gating", False,
    "Whether to apply probability gating in MoeModel.")
flags.DEFINE_string(
    "moe_prob_gating_input", "prob",
    "Input to the probability gating in MoeModel: 'prob' gates on the "
    "predicted probabilities, anything else gates on the raw model input.")
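# Illustrative sketch, not part of the original file: the per-class
# mixture-of-experts probability computed by hand for num_mixtures = 2.
# The gating softmax spans num_mixtures + 1 logits; the extra "dummy"
# expert always predicts 0, so any gate mass it receives simply lowers
# the class probability, exactly as in MoeModel.create_model below.
#
#   import numpy as np
#   gates = np.exp(np.array([1.0, 0.5, -1.0]))
#   gates /= gates.sum()                                     # softmax, sums to 1
#   experts = 1.0 / (1.0 + np.exp(-np.array([2.0, -0.5])))   # sigmoid
#   p_class = (gates[:2] * experts).sum()                    # dummy adds 0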
class MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in
    the mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.

    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2  # The flag overrides the l2_penalty argument.
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    # gating_remove_diag is defined in frame_level_models.py.
    remove_diag = FLAGS.gating_remove_diag

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients of the gating weights.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")

      gates = tf.sigmoid(gates)

      probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
--------------------------------------------------------------------------------
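Finally, a minimal sketch of how MoeModel is typically instantiated, assuming TensorFlow 1.x graph mode; the 1152-dimensional input (1024 RGB + 128 audio video-level features) and the vocabulary size are illustrative, and frame_level_models is imported because create_model reads FLAGS.gating_remove_diag, which is defined in that module:

import tensorflow as tf

import frame_level_models  # defines gating_remove_diag, read by create_model
import video_level_models

# Hypothetical shapes: video-level input features and label vocabulary size.
model_input = tf.placeholder(tf.float32, shape=[None, 1152])
outputs = video_level_models.MoeModel().create_model(
    model_input, vocab_size=4716, is_training=True)
predictions = outputs["predictions"]  # [batch_size, 4716] probabilities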