├── LICENSE ├── README.md ├── __init__.py ├── average_precision_calculator.py ├── cloudml-gpu-distributed.yaml ├── cloudml-gpu.yaml ├── convert_prediction_from_json_to_csv.py ├── eval.py ├── eval_util.py ├── export_model.py ├── file_averaging.py ├── frame_level_models.py ├── inference.py ├── losses.py ├── mean_average_precision_calculator.py ├── model_utils.py ├── models.py ├── readers.py ├── train.py ├── utils.py └── video_level_models.py /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Youtube-8M-WILLOW 2 | 3 | NEW: I just released a pretrained model (Gated NetVLAD) as of 11th December 2017! 
4 | You can download the pretrained model here: https://www.rocq.inria.fr/cluster-willow/amiech/pretrainedmodel.zip
5 | The model is: gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe
6 |
7 | This is the code of the winners of the kaggle Youtube-8M Large-Scale Video Understanding challenge (https://www.kaggle.com/c/youtube8m).
8 | For more details about our models, please read our arXiv paper: https://arxiv.org/abs/1706.06905.
9 |
10 | This repo contains the code to reproduce a winning submission for the kaggle challenge. If you are just looking for an efficient Tensorflow implementation of NetVLAD, NetRVLAD, NetFV, Soft-DBoW and their gated versions, please directly consult our Tensorflow toolbox (LOUPE): https://github.com/antoine77340/LOUPE.
11 |
12 | The code is built on top of the Google Youtube-8M starter code (https://github.com/google/youtube-8m).
13 | Please look at their README for the dependencies needed to run the code (mainly Tensorflow 1.0).
14 |
15 | The only additional requirement is the pandas python library.
16 |
17 | Hardware requirement: each model was trained on a single NVIDIA TITAN X 12 GB GPU. Be aware that some of the models
18 | do not fit on a GPU with less than 9 GB of memory. Please do not modify the training batch size
19 | of these models, as it might affect the final results.
20 |
21 | Our best submitted model (GAP: 84.967% on the private leaderboard) is a weighted ensemble of 25 models.
22 | However, for the sake of simplicity, we present a much simpler ensemble of
23 | 7 models that is enough to reach first place with a significant margin (GAP ~ 84.7%). The 25 trained models
24 | are only very similar variants (differing in hyper-parameters) of these seven main models.
25 |
26 | Please note that, because of time constraints, we did not have a chance to re-run the code from scratch.
27 | It is possible, though rather unlikely, that something is not working properly. If so, please create an issue on
28 | github.
29 |
30 | ## Training the single models
31 |
32 | Each of the following command lines trains a single model; each is scheduled to stop training at the right time.
33 |
34 | Our models were trained on the full training set and almost all of the validation set.
35 | We only discarded 21k videos to build a smaller validation set.
36 | This validation set (used in the arXiv paper) is composed of all the tensorflow record files of the form 'validatea*.tfrecord'.
37 | We will, however, train the models on both the training and validation sets here, as was allowed in the kaggle competition; it should not make any significant difference.
38 |
39 | Each model takes several days to train, so the command lines are given separately so that they can be run in parallel if possible.
40 | Please replace 'path_to_features' with the path of the folder that contains all the tensorflow record frame-level features:
41 | ```sh
42 | path_to_features='path_to_features'
43 | ```
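Optionally, you can sanity-check that the downloaded files are the frame-level features the readers expect (and not the video-level ones). The snippet below is a minimal sketch, assuming TensorFlow 1.x and the standard YouTube-8M record layout; the file name is hypothetical, so point it at any tfrecord in your features folder:

```python
import tensorflow as tf

# Peek at the first record of one frame-level file (hypothetical file name).
record = next(tf.python_io.tf_record_iterator("path_to_features/trainab.tfrecord"))
example = tf.train.SequenceExample.FromString(record)
print(example.context.feature["video_id"].bytes_list.value[0])   # video id
print(len(example.feature_lists.feature_list["rgb"].feature))    # number of frames (rgb)
print(len(example.feature_lists.feature_list["audio"].feature))  # number of frames (audio)
```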
44 |
45 | Training Gated NetVLAD (256 Clusters):
46 |
47 | ```sh
48 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --max_step=700000
49 | ```
50 |
51 | Note: the best single model is this one, but with the flag --max_step=300000. We train it longer here because that works better in the ensemble.
52 |
53 | Training Gated NetFV (128 Clusters):
54 |
55 |
56 | ```sh
57 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetFVModelLF --train_dir=gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --fv_cluster_size=128 --fv_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --fv_relu=False --gating=True --moe_prob_gating=True --fv_couple_weights=False --max_step=600000
58 | ```
59 |
60 | Training Gated Soft-DBoW (4096 Clusters):
61 |
62 | ```sh
63 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=GatedDbofModelLF --train_dir=gateddboflf-4096-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --dbof_cluster_size=4096 --dbof_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --dbof_relu=False --max_step=1000000
64 | ```
65 |
66 | Training Soft-DBoW (8000 Clusters):
67 |
68 | ```sh
69 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=SoftDbofModelLF --train_dir=softdboflf-8000-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --dbof_cluster_size=8000 --dbof_hidden_size=1024 --iterations=300 --dbof_relu=False --max_step=800000
70 | ```
71 |
72 | Training Gated NetRVLAD (256 Clusters):
73 |
74 | ```sh
75 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=NetVLADModelLF --train_dir=gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=80 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --lightvlad=True --max_step=600000
76 | ```
77 |
78 | Training GRU (2 layers):
79 |
80 | ```sh
81 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=GruModel --train_dir=GRU-0002-1200 --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --base_learning_rate=0.0002 --gru_cells=1200 --learning_rate_decay=0.9 --moe_l2=1e-6 --max_step=300000
82 | ```
83 |
84 | Training LSTM (2 layers):
85 |
86 | ```sh
87 | python train.py --train_data_pattern="$path_to_features/*a*??.tfrecord" --model=LstmModel --train_dir=lstm-0002-val-150-random --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --base_learning_rate=0.0002 --iterations=150 --lstm_random_sequence=True --max_step=400000
88 | ```
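While (or after) training, you can monitor the GAP of a model on the small held-out validation set with eval.py. The following is a sketch, not part of the original instructions, for the Gated NetVLAD model; it assumes the model-specific flags are set exactly as in that model's training command above:

```sh
python eval.py --eval_data_pattern="$path_to_features/validatea*.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=128 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --netvlad_relu=False --gating=True --moe_prob_gating=True --run_once=True
```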
89 |
90 |
91 | ## Inference
92 |
93 | After training, we will write the predictions into 7 different files and then ensemble them.
94 | Run each one of these commands to run inference for the corresponding model.
95 |
96 | ```sh
97 | python inference.py --output_file=test-lstm-0002-val-150-random.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=LstmModel --train_dir=lstm-0002-val-150-random --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --iterations=150 --lstm_random_sequence=True --run_once=True --top_k=50
98 |
99 | python inference.py --output_file=test-gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetVLADModelLF --train_dir=gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --lightvlad=True --run_once=True --top_k=50
100 |
101 | python inference.py --output_file=test-gateddboflf-4096-1024-80-0002-300iter-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=GatedDbofModelLF --train_dir=gateddboflf-4096-1024-80-0002-300iter-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=512 --base_learning_rate=0.0002 --dbof_cluster_size=4096 --dbof_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --dbof_relu=False --moe_prob_gating=True --run_once=True --top_k=50
102 |
103 | python inference.py --output_file=test-gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetFVModelLF --train_dir=gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --fv_cluster_size=128 --fv_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --fv_relu=False --gating=True --moe_prob_gating=True --fv_couple_weights=False --top_k=50
104 |
105 | python inference.py --output_file=test-GRU-0002-1200.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=GruModel --train_dir=GRU-0002-1200 --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --gru_cells=1200 --learning_rate_decay=0.9 --moe_l2=1e-6 --run_once=True --top_k=50
106 |
107 | python inference.py --output_file=test-gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=NetVLADModelLF --train_dir=gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=1024 --base_learning_rate=0.0002 --netvlad_cluster_size=256 --netvlad_hidden_size=1024 --moe_l2=1e-6 --iterations=300 --learning_rate_decay=0.8 --netvlad_relu=False --gating=True --moe_prob_gating=True --run_once=True --top_k=50
108 |
109 | python inference.py --output_file=test-softdboflf-8000-1024-80-0002-300iter.csv --input_data_pattern="$path_to_features/test*.tfrecord" --model=SoftDbofModelLF --train_dir=softdboflf-8000-1024-80-0002-300iter --frame_features=True --feature_names="rgb,audio" --feature_sizes="1024,128" --batch_size=256 --base_learning_rate=0.0002 --dbof_cluster_size=8000 --dbof_hidden_size=1024 --iterations=300 --dbof_relu=False --run_once=True --top_k=50
110 | ```
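Each of these runs writes one CSV per model. For reference, the layout (presumably the same one file_averaging.py consumes) is a VideoId,LabelConfidencePairs header followed by one row per test video with top_k space-separated 'label confidence' pairs; schematically, with made-up values:

```
VideoId,LabelConfidencePairs
AbCdEf1234,0 0.981 12 0.774 7 0.431 ...
```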
111 |
112 | ## Averaging the models
113 |
114 | After inference is done for all the models, just run:
115 |
116 |
117 | ```sh
118 | python file_averaging.py
119 | ```
120 |
121 | It will only take you the time to make a coffee, and the submission file will be written to WILLOW_submission.csv
122 | before you finish drinking it :D.
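file_averaging.py itself is not reproduced here, but the ensembling step amounts to a weighted per-video average of the seven CSVs. A self-contained sketch of such an averaging step, with illustrative equal weights (not the competition weights) and only two of the seven file names filled in:

```python
import pandas as pd
from collections import defaultdict

# Hypothetical weights; add the other five inference CSVs analogously.
files = {"test-lstm-0002-val-150-random.csv": 1.0,
         "test-GRU-0002-1200.csv": 1.0}

def read_preds(path):
    """Parse 'label score label score ...' pairs into a dict per video."""
    df = pd.read_csv(path)
    out = {}
    for vid, pairs in zip(df["VideoId"], df["LabelConfidencePairs"]):
        tokens = str(pairs).split()
        out[vid] = {int(l): float(s) for l, s in zip(tokens[::2], tokens[1::2])}
    return out

# Sum the weighted scores of every model, per video and per label.
blend = defaultdict(lambda: defaultdict(float))
for path, weight in files.items():
    for vid, scores in read_preds(path).items():
        for label, score in scores.items():
            blend[vid][label] += weight * score

# Write the 20 highest averaged scores per video in submission format.
total_weight = sum(files.values())
with open("WILLOW_submission.csv", "w") as f:
    f.write("VideoId,LabelConfidencePairs\n")
    for vid, scores in blend.items():
        top = sorted(scores.items(), key=lambda x: -x[1])[:20]
        f.write(vid + "," +
                " ".join("%d %f" % (l, s / total_weight) for l, s in top) + "\n")
```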
123 |
124 | Antoine Miech
125 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/average_precision_calculator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Calculate or keep track of the interpolated average precision.
16 |
17 | It provides an interface for calculating interpolated average precision for an
18 | entire list or the top-n ranked items. For the definition of the
19 | (non-)interpolated average precision:
20 | http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf
21 |
22 | Example usages:
23 | 1) Use it as a static function call to directly calculate average precision for
24 | a short ranked list in the memory.
25 |
26 | ```
27 | import random
28 | import numpy as np
29 |
30 | p = np.array([random.random() for _ in range(10)])
31 | a = np.array([random.choice([0, 1]) for _ in range(10)])
32 |
33 | ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
34 | ```
35 |
36 | 2) Use it as an object for long ranked lists that cannot be stored in memory, or
37 | for the case where partial predictions can be observed at a time (Tensorflow
38 | predictions). In this case, we first call the function accumulate many times
39 | to process parts of the ranked list. After processing all the parts, we call
40 | peek_ap_at_n.
41 |
42 | ```
43 | p1 = np.array([random.random() for _ in range(5)])
44 | a1 = np.array([random.choice([0, 1]) for _ in range(5)])
45 | p2 = np.array([random.random() for _ in range(5)])
46 | a2 = np.array([random.choice([0, 1]) for _ in range(5)])
47 |
48 | # non-interpolated average precision at 10
49 | calculator = average_precision_calculator.AveragePrecisionCalculator(10)
50 | calculator.accumulate(p1, a1)
51 | calculator.accumulate(p2, a2)
52 | ap3 = calculator.peek_ap_at_n()
53 | ```
54 | """
55 |
56 | import heapq
57 | import random
58 | import numbers
59 |
60 | import numpy
61 |
62 |
63 | class AveragePrecisionCalculator(object):
64 |   """Calculate the average precision and average precision at n."""
65 |
66 |   def __init__(self, top_n=None):
67 |     """Construct an AveragePrecisionCalculator to calculate average precision.
68 |
69 |     This class is used to calculate the average precision for a single label.
70 |
71 |     Args:
72 |       top_n: A positive integer specifying the average precision at n, or
73 |         None to use all provided data points.
74 |
75 |     Raises:
76 |       ValueError: An error occurred when the top_n is not a positive integer.
77 |     """
78 |     if not ((isinstance(top_n, int) and top_n > 0) or top_n is None):
79 |       raise ValueError("top_n must be a positive integer or None.")
80 |
81 |     self._top_n = top_n  # average precision at n
82 |     self._total_positives = 0  # total number of positives seen so far
83 |     self._heap = []  # min-heap holding the top-n (prediction, actual) pairs
84 |
85 |   @property
86 |   def heap_size(self):
87 |     """Gets the heap size maintained in the class."""
88 |     return len(self._heap)
89 |
90 |   @property
91 |   def num_accumulated_positives(self):
92 |     """Gets the number of positive samples that have been accumulated."""
93 |     return self._total_positives
94 |
95 |   def accumulate(self, predictions, actuals, num_positives=None):
96 |     """Accumulate the predictions and their ground truth labels.
97 |
98 |     After the function call, we may call peek_ap_at_n to actually calculate
99 |     the average precision.
100 |     Note predictions and actuals must have the same shape.
101 |
102 |     Args:
103 |       predictions: a list storing the prediction scores.
104 |       actuals: a list storing the ground truth labels. Any value
105 |         larger than 0 will be treated as positives, otherwise as negatives.
106 |       num_positives: If the 'predictions' and 'actuals' inputs aren't complete,
107 |         then it's possible some true positives were missed in them. In that
108 |         case, you can provide 'num_positives' in order to accurately track recall.
109 |
110 |     Raises:
111 |       ValueError: An error occurred when the format of the input is not the
112 |         numpy 1-D array or the shape of predictions and actuals does not match.
111 | """ 112 | if len(predictions) != len(actuals): 113 | raise ValueError("the shape of predictions and actuals does not match.") 114 | 115 | if not num_positives is None: 116 | if not isinstance(num_positives, numbers.Number) or num_positives < 0: 117 | raise ValueError("'num_positives' was provided but it wan't a nonzero number.") 118 | 119 | if not num_positives is None: 120 | self._total_positives += num_positives 121 | else: 122 | self._total_positives += numpy.size(numpy.where(actuals > 0)) 123 | topk = self._top_n 124 | heap = self._heap 125 | 126 | for i in range(numpy.size(predictions)): 127 | if topk is None or len(heap) < topk: 128 | heapq.heappush(heap, (predictions[i], actuals[i])) 129 | else: 130 | if predictions[i] > heap[0][0]: # heap[0] is the smallest 131 | heapq.heappop(heap) 132 | heapq.heappush(heap, (predictions[i], actuals[i])) 133 | 134 | def clear(self): 135 | """Clear the accumulated predictions.""" 136 | self._heap = [] 137 | self._total_positives = 0 138 | 139 | def peek_ap_at_n(self): 140 | """Peek the non-interpolated average precision at n. 141 | 142 | Returns: 143 | The non-interpolated average precision at n (default 0). 144 | If n is larger than the length of the ranked list, 145 | the average precision will be returned. 146 | """ 147 | if self.heap_size <= 0: 148 | return 0 149 | predlists = numpy.array(list(zip(*self._heap))) 150 | 151 | ap = self.ap_at_n(predlists[0], 152 | predlists[1], 153 | n=self._top_n, 154 | total_num_positives=self._total_positives) 155 | return ap 156 | 157 | @staticmethod 158 | def ap(predictions, actuals): 159 | """Calculate the non-interpolated average precision. 160 | 161 | Args: 162 | predictions: a numpy 1-D array storing the sparse prediction scores. 163 | actuals: a numpy 1-D array storing the ground truth labels. Any value 164 | larger than 0 will be treated as positives, otherwise as negatives. 165 | 166 | Returns: 167 | The non-interpolated average precision at n. 168 | If n is larger than the length of the ranked list, 169 | the average precision will be returned. 170 | 171 | Raises: 172 | ValueError: An error occurred when the format of the input is not the 173 | numpy 1-D array or the shape of predictions and actuals does not match. 174 | """ 175 | return AveragePrecisionCalculator.ap_at_n(predictions, 176 | actuals, 177 | n=None) 178 | 179 | @staticmethod 180 | def ap_at_n(predictions, actuals, n=20, total_num_positives=None): 181 | """Calculate the non-interpolated average precision. 182 | 183 | Args: 184 | predictions: a numpy 1-D array storing the sparse prediction scores. 185 | actuals: a numpy 1-D array storing the ground truth labels. Any value 186 | larger than 0 will be treated as positives, otherwise as negatives. 187 | n: the top n items to be considered in ap@n. 188 | total_num_positives : (optionally) you can specify the number of total 189 | positive 190 | in the list. If specified, it will be used in calculation. 191 | 192 | Returns: 193 | The non-interpolated average precision at n. 194 | If n is larger than the length of the ranked list, 195 | the average precision will be returned. 196 | 197 | Raises: 198 | ValueError: An error occurred when 199 | 1) the format of the input is not the numpy 1-D array; 200 | 2) the shape of predictions and actuals does not match; 201 | 3) the input n is not a positive integer. 
202 | """ 203 | if len(predictions) != len(actuals): 204 | raise ValueError("the shape of predictions and actuals does not match.") 205 | 206 | if n is not None: 207 | if not isinstance(n, int) or n <= 0: 208 | raise ValueError("n must be 'None' or a positive integer." 209 | " It was '%s'." % n) 210 | 211 | ap = 0.0 212 | 213 | predictions = numpy.array(predictions) 214 | actuals = numpy.array(actuals) 215 | 216 | # add a shuffler to avoid overestimating the ap 217 | predictions, actuals = AveragePrecisionCalculator._shuffle(predictions, 218 | actuals) 219 | sortidx = sorted( 220 | range(len(predictions)), 221 | key=lambda k: predictions[k], 222 | reverse=True) 223 | 224 | if total_num_positives is None: 225 | numpos = numpy.size(numpy.where(actuals > 0)) 226 | else: 227 | numpos = total_num_positives 228 | 229 | if numpos == 0: 230 | return 0 231 | 232 | if n is not None: 233 | numpos = min(numpos, n) 234 | delta_recall = 1.0 / numpos 235 | poscount = 0.0 236 | 237 | # calculate the ap 238 | r = len(sortidx) 239 | if n is not None: 240 | r = min(r, n) 241 | for i in range(r): 242 | if actuals[sortidx[i]] > 0: 243 | poscount += 1 244 | ap += poscount / (i + 1) * delta_recall 245 | return ap 246 | 247 | @staticmethod 248 | def _shuffle(predictions, actuals): 249 | random.seed(0) 250 | suffidx = random.sample(range(len(predictions)), len(predictions)) 251 | predictions = predictions[suffidx] 252 | actuals = actuals[suffidx] 253 | return predictions, actuals 254 | 255 | @staticmethod 256 | def _zero_one_normalize(predictions, epsilon=1e-7): 257 | """Normalize the predictions to the range between 0.0 and 1.0. 258 | 259 | For some predictions like SVM predictions, we need to normalize them before 260 | calculate the interpolated average precision. The normalization will not 261 | change the rank in the original list and thus won't change the average 262 | precision. 263 | 264 | Args: 265 | predictions: a numpy 1-D array storing the sparse prediction scores. 266 | epsilon: a small constant to avoid denominator being zero. 267 | 268 | Returns: 269 | The normalized prediction. 270 | """ 271 | denominator = numpy.max(predictions) - numpy.min(predictions) 272 | ret = (predictions - numpy.min(predictions)) / numpy.max(denominator, 273 | epsilon) 274 | return ret 275 | -------------------------------------------------------------------------------- /cloudml-gpu-distributed.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | runtimeVersion: "1.0" 3 | scaleTier: CUSTOM 4 | masterType: standard_gpu 5 | workerCount: 2 6 | workerType: standard_gpu 7 | parameterServerCount: 2 8 | parameterServerType: standard 9 | -------------------------------------------------------------------------------- /cloudml-gpu.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: CUSTOM 3 | # standard_gpu provides 1 GPU. Change to complex_model_m_gpu for 4 GPUs 4 | masterType: standard_gpu 5 | runtimeVersion: "1.0" 6 | -------------------------------------------------------------------------------- /convert_prediction_from_json_to_csv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Utility to convert the output of batch prediction into a CSV submission. 16 | 17 | It converts the JSON files created by the command 18 | 'gcloud beta ml jobs submit prediction' into a CSV file ready for submission. 19 | """ 20 | 21 | import json 22 | import tensorflow as tf 23 | 24 | from builtins import range 25 | from tensorflow import app 26 | from tensorflow import flags 27 | from tensorflow import gfile 28 | from tensorflow import logging 29 | 30 | 31 | FLAGS = flags.FLAGS 32 | 33 | if __name__ == '__main__': 34 | 35 | flags.DEFINE_string( 36 | "json_prediction_files_pattern", None, 37 | "Pattern specifying the list of JSON files that the command " 38 | "'gcloud beta ml jobs submit prediction' outputs. These files are " 39 | "located in the output path of the prediction command and are prefixed " 40 | "with 'prediction.results'.") 41 | flags.DEFINE_string( 42 | "csv_output_file", None, 43 | "The file to save the predictions converted to the CSV format.") 44 | 45 | 46 | def get_csv_header(): 47 | return "VideoId,LabelConfidencePairs\n" 48 | 49 | def to_csv_row(json_data): 50 | 51 | video_id = json_data["video_id"] 52 | 53 | class_indexes = json_data["class_indexes"] 54 | predictions = json_data["predictions"] 55 | 56 | if isinstance(video_id, list): 57 | video_id = video_id[0] 58 | class_indexes = class_indexes[0] 59 | predictions = predictions[0] 60 | 61 | if len(class_indexes) != len(predictions): 62 | raise ValueError( 63 | "The number of indexes (%s) and predictions (%s) must be equal." 64 | % (len(class_indexes), len(predictions))) 65 | 66 | return (video_id.decode('utf-8') + "," + " ".join("%i %f" % 67 | (class_indexes[i], predictions[i]) 68 | for i in range(len(class_indexes))) + "\n") 69 | 70 | def main(unused_argv): 71 | logging.set_verbosity(tf.logging.INFO) 72 | 73 | if not FLAGS.json_prediction_files_pattern: 74 | raise ValueError( 75 | "The flag --json_prediction_files_pattern must be specified.") 76 | 77 | if not FLAGS.csv_output_file: 78 | raise ValueError("The flag --csv_output_file must be specified.") 79 | 80 | logging.info("Looking for prediction files with pattern: %s", 81 | FLAGS.json_prediction_files_pattern) 82 | 83 | file_paths = gfile.Glob(FLAGS.json_prediction_files_pattern) 84 | logging.info("Found files: %s", file_paths) 85 | 86 | logging.info("Writing submission file to: %s", FLAGS.csv_output_file) 87 | with gfile.Open(FLAGS.csv_output_file, "w+") as output_file: 88 | output_file.write(get_csv_header()) 89 | 90 | for file_path in file_paths: 91 | logging.info("processing file: %s", file_path) 92 | 93 | with gfile.Open(file_path) as input_file: 94 | 95 | for line in input_file: 96 | json_data = json.loads(line) 97 | output_file.write(to_csv_row(json_data)) 98 | 99 | output_file.flush() 100 | logging.info("done") 101 | 102 | if __name__ == "__main__": 103 | app.run() 104 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Binary for evaluating Tensorflow models on the YouTube-8M dataset."""
15 |
16 | import time
17 |
18 | import eval_util
19 | import losses
20 | import frame_level_models
21 | import video_level_models
22 | import readers
23 | import tensorflow as tf
24 | from tensorflow import app
25 | from tensorflow import flags
26 | from tensorflow import gfile
27 | from tensorflow import logging
28 | import utils
29 |
30 | FLAGS = flags.FLAGS
31 |
32 | if __name__ == "__main__":
33 |   # Dataset flags.
34 |   flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
35 |                       "The directory to load the model files from. "
36 |                       "The tensorboard metrics files are also saved to this "
37 |                       "directory.")
38 |   flags.DEFINE_string(
39 |       "eval_data_pattern", "",
40 |       "File glob defining the evaluation dataset in tensorflow.SequenceExample "
41 |       "format. The SequenceExamples are expected to have an 'rgb' byte array "
42 |       "sequence feature as well as a 'labels' int64 context feature.")
43 |   flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
44 |                       "to use for training.")
45 |   flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")
46 |
47 |   # Model flags.
48 |   flags.DEFINE_bool(
49 |       "frame_features", False,
50 |       "If set, then --eval_data_pattern must be frame-level features. "
51 |       "Otherwise, --eval_data_pattern must be aggregated video-level "
52 |       "features. The model must also be set appropriately (i.e. to read 3D "
53 |       "batches VS 4D batches).")
54 |   flags.DEFINE_string(
55 |       "model", "LogisticModel",
56 |       "Which architecture to use for the model. Options include 'Logistic', "
57 |       "'SingleMixtureMoe', and 'TwoLayerSigmoid'. See video_level_models.py and "
58 |       "frame_level_models.py for the model definitions.")
59 |   flags.DEFINE_integer("batch_size", 1024,
60 |                        "How many examples to process per batch.")
61 |   flags.DEFINE_string("label_loss", "CrossEntropyLoss",
62 |                       "Loss computed on validation data")
63 |
64 |   # Other flags.
65 |   flags.DEFINE_integer("num_readers", 4,
66 |                        "How many threads to use for reading input files.")
67 |   flags.DEFINE_boolean("run_once", False, "Whether to run eval only once.")
68 |   flags.DEFINE_integer("top_k", 20, "How many predictions to output per video.")
69 |   flags.DEFINE_integer("check_point", -1,
70 |                        "Model checkpoint to load, -1 for latest.")
71 |
72 |
73 |
74 | def find_class_by_name(name, modules):
75 |   """Searches the provided modules for the named class and returns it."""
76 |   modules = [getattr(module, name, None) for module in modules]
77 |   return next(a for a in modules if a)
78 |
79 |
80 | def get_input_evaluation_tensors(reader,
81 |                                  data_pattern,
82 |                                  batch_size=1024,
83 |                                  num_readers=1):
84 |   """Creates the section of the graph which reads the evaluation data.
85 |
86 |   Args:
87 |     reader: A class which parses the training data.
88 |     data_pattern: A 'glob' style path to the data files.
89 |     batch_size: How many examples to process at a time.
90 | num_readers: How many I/O threads to use. 91 | 92 | Returns: 93 | A tuple containing the features tensor, labels tensor, and optionally a 94 | tensor containing the number of frames per video. The exact dimensions 95 | depend on the reader being used. 96 | 97 | Raises: 98 | IOError: If no files matching the given pattern were found. 99 | """ 100 | logging.info("Using batch size of " + str(batch_size) + " for evaluation.") 101 | with tf.name_scope("eval_input"): 102 | files = gfile.Glob(data_pattern) 103 | if not files: 104 | raise IOError("Unable to find the evaluation files.") 105 | logging.info("number of evaluation files: " + str(len(files))) 106 | filename_queue = tf.train.string_input_producer( 107 | files, shuffle=False, num_epochs=1) 108 | eval_data = [ 109 | reader.prepare_reader(filename_queue) for _ in range(num_readers) 110 | ] 111 | return tf.train.batch_join( 112 | eval_data, 113 | batch_size=batch_size, 114 | capacity=3 * batch_size, 115 | allow_smaller_final_batch=True, 116 | enqueue_many=True) 117 | 118 | 119 | def build_graph(reader, 120 | model, 121 | eval_data_pattern, 122 | label_loss_fn, 123 | batch_size=1024, 124 | num_readers=1): 125 | """Creates the Tensorflow graph for evaluation. 126 | 127 | Args: 128 | reader: The data file reader. It should inherit from BaseReader. 129 | model: The core model (e.g. logistic or neural net). It should inherit 130 | from BaseModel. 131 | eval_data_pattern: glob path to the evaluation data files. 132 | label_loss_fn: What kind of loss to apply to the model. It should inherit 133 | from BaseLoss. 134 | batch_size: How many examples to process at a time. 135 | num_readers: How many threads to use for I/O operations. 136 | """ 137 | 138 | global_step = tf.Variable(0, trainable=False, name="global_step") 139 | video_id_batch, model_input_raw, labels_batch, num_frames = get_input_evaluation_tensors( # pylint: disable=g-line-too-long 140 | reader, 141 | eval_data_pattern, 142 | batch_size=batch_size, 143 | num_readers=num_readers) 144 | tf.summary.histogram("model_input_raw", model_input_raw) 145 | 146 | feature_dim = len(model_input_raw.get_shape()) - 1 147 | 148 | # Normalize input features. 149 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 150 | 151 | with tf.name_scope("model"): 152 | result = model.create_model(model_input, 153 | num_frames=num_frames, 154 | vocab_size=reader.num_classes, 155 | labels=labels_batch, 156 | is_training=False) 157 | predictions = result["predictions"] 158 | tf.summary.histogram("model_activations", predictions) 159 | if "loss" in result.keys(): 160 | label_loss = result["loss"] 161 | else: 162 | label_loss = label_loss_fn.calculate_loss(predictions, labels_batch) 163 | 164 | tf.add_to_collection("global_step", global_step) 165 | tf.add_to_collection("loss", label_loss) 166 | tf.add_to_collection("predictions", predictions) 167 | tf.add_to_collection("input_batch", model_input) 168 | tf.add_to_collection("video_id_batch", video_id_batch) 169 | tf.add_to_collection("num_frames", num_frames) 170 | tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) 171 | tf.add_to_collection("summary_op", tf.summary.merge_all()) 172 | 173 | 174 | def evaluation_loop(video_id_batch, prediction_batch, label_batch, loss, 175 | summary_op, saver, summary_writer, evl_metrics, 176 | last_global_step_val): 177 | """Run the evaluation loop once. 178 | 179 | Args: 180 | video_id_batch: a tensor of video ids mini-batch. 181 | prediction_batch: a tensor of predictions mini-batch. 
182 | label_batch: a tensor of label_batch mini-batch. 183 | loss: a tensor of loss for the examples in the mini-batch. 184 | summary_op: a tensor which runs the tensorboard summary operations. 185 | saver: a tensorflow saver to restore the model. 186 | summary_writer: a tensorflow summary_writer 187 | evl_metrics: an EvaluationMetrics object. 188 | last_global_step_val: the global step used in the previous evaluation. 189 | 190 | Returns: 191 | The global_step used in the latest model. 192 | """ 193 | 194 | global_step_val = -1 195 | with tf.Session() as sess: 196 | if FLAGS.check_point < 0: 197 | latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir) 198 | else: 199 | latest_checkpoint = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) 200 | 201 | if latest_checkpoint: 202 | logging.info("Loading checkpoint for eval: " + latest_checkpoint) 203 | # Restores from checkpoint 204 | saver.restore(sess, latest_checkpoint) 205 | # Assuming model_checkpoint_path looks something like: 206 | # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it. 207 | global_step_val = latest_checkpoint.split("/")[-1].split("-")[-1] 208 | else: 209 | logging.info("No checkpoint file found.") 210 | return global_step_val 211 | 212 | if global_step_val == last_global_step_val: 213 | logging.info("skip this checkpoint global_step_val=%s " 214 | "(same as the previous one).", global_step_val) 215 | return global_step_val 216 | 217 | sess.run([tf.local_variables_initializer()]) 218 | 219 | # Start the queue runners. 220 | fetches = [video_id_batch, prediction_batch, label_batch, loss, summary_op] 221 | coord = tf.train.Coordinator() 222 | try: 223 | threads = [] 224 | for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): 225 | threads.extend(qr.create_threads( 226 | sess, coord=coord, daemon=True, 227 | start=True)) 228 | logging.info("enter eval_once loop global_step_val = %s. ", 229 | global_step_val) 230 | 231 | evl_metrics.clear() 232 | 233 | examples_processed = 0 234 | while not coord.should_stop(): 235 | batch_start_time = time.time() 236 | _, predictions_val, labels_val, loss_val, summary_val = sess.run( 237 | fetches) 238 | seconds_per_batch = time.time() - batch_start_time 239 | example_per_second = labels_val.shape[0] / seconds_per_batch 240 | examples_processed += labels_val.shape[0] 241 | 242 | iteration_info_dict = evl_metrics.accumulate(predictions_val, 243 | labels_val, loss_val) 244 | iteration_info_dict["examples_per_second"] = example_per_second 245 | 246 | iterinfo = utils.AddGlobalStepSummary( 247 | summary_writer, 248 | global_step_val, 249 | iteration_info_dict, 250 | summary_scope="Eval") 251 | logging.info("examples_processed: %d | %s", examples_processed, 252 | iterinfo) 253 | 254 | except tf.errors.OutOfRangeError as e: 255 | logging.info( 256 | "Done with batched inference. 
Now calculating global performance "
257 |           "metrics.")
258 |       # calculate the metrics for the entire epoch
259 |       epoch_info_dict = evl_metrics.get()
260 |       epoch_info_dict["epoch_id"] = global_step_val
261 |
262 |       summary_writer.add_summary(summary_val, global_step_val)
263 |       epochinfo = utils.AddEpochSummary(
264 |           summary_writer,
265 |           global_step_val,
266 |           epoch_info_dict,
267 |           summary_scope="Eval")
268 |       logging.info(epochinfo)
269 |       evl_metrics.clear()
270 |     except Exception as e:  # pylint: disable=broad-except
271 |       logging.info("Unexpected exception: " + str(e))
272 |       coord.request_stop(e)
273 |
274 |     coord.request_stop()
275 |     coord.join(threads, stop_grace_period_secs=10)
276 |
277 |     return global_step_val
278 |
279 |
280 | def evaluate():
281 |   tf.set_random_seed(0)  # for reproducibility
282 |   with tf.Graph().as_default():
283 |     # convert feature_names and feature_sizes to lists of values
284 |     feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
285 |         FLAGS.feature_names, FLAGS.feature_sizes)
286 |
287 |     if FLAGS.frame_features:
288 |       reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
289 |                                               feature_sizes=feature_sizes)
290 |     else:
291 |       reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
292 |                                                    feature_sizes=feature_sizes)
293 |
294 |     model = find_class_by_name(FLAGS.model,
295 |                                [frame_level_models, video_level_models])()
296 |     label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
297 |
298 |     if FLAGS.eval_data_pattern == "":  # '==' (value equality), not 'is' (identity)
299 |       raise IOError("'eval_data_pattern' was not specified. " +
300 |                     "Nothing to evaluate.")
301 |
302 |     build_graph(
303 |         reader=reader,
304 |         model=model,
305 |         eval_data_pattern=FLAGS.eval_data_pattern,
306 |         label_loss_fn=label_loss_fn,
307 |         num_readers=FLAGS.num_readers,
308 |         batch_size=FLAGS.batch_size)
309 |     logging.info("built evaluation graph")
310 |     video_id_batch = tf.get_collection("video_id_batch")[0]
311 |     prediction_batch = tf.get_collection("predictions")[0]
312 |     label_batch = tf.get_collection("labels")[0]
313 |     loss = tf.get_collection("loss")[0]
314 |     summary_op = tf.get_collection("summary_op")[0]
315 |
316 |     saver = tf.train.Saver(tf.global_variables())
317 |     summary_writer = tf.summary.FileWriter(
318 |         FLAGS.train_dir, graph=tf.get_default_graph())
319 |
320 |     evl_metrics = eval_util.EvaluationMetrics(reader.num_classes, FLAGS.top_k)
321 |
322 |     last_global_step_val = -1
323 |     while True:
324 |       last_global_step_val = evaluation_loop(video_id_batch, prediction_batch,
325 |                                              label_batch, loss, summary_op,
326 |                                              saver, summary_writer, evl_metrics,
327 |                                              last_global_step_val)
328 |       if FLAGS.run_once:
329 |         break
330 |
331 |
332 | def main(unused_argv):
333 |   logging.set_verbosity(tf.logging.INFO)
334 |   print("tensorflow version: %s" % tf.__version__)
335 |   evaluate()
336 |
337 |
338 | if __name__ == "__main__":
339 |   app.run()
340 |
341 |
--------------------------------------------------------------------------------
/eval_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Provides functions to help with evaluating models.""" 16 | import datetime 17 | import numpy 18 | 19 | #from tensorflow.python.platform import gfile 20 | 21 | import mean_average_precision_calculator as map_calculator 22 | import average_precision_calculator as ap_calculator 23 | 24 | def flatten(l): 25 | """ Merges a list of lists into a single list. """ 26 | return [item for sublist in l for item in sublist] 27 | 28 | def calculate_hit_at_one(predictions, actuals): 29 | """Performs a local (numpy) calculation of the hit at one. 30 | 31 | Args: 32 | predictions: Matrix containing the outputs of the model. 33 | Dimensions are 'batch' x 'num_classes'. 34 | actuals: Matrix containing the ground truth labels. 35 | Dimensions are 'batch' x 'num_classes'. 36 | 37 | Returns: 38 | float: The average hit at one across the entire batch. 39 | """ 40 | top_prediction = numpy.argmax(predictions, 1) 41 | hits = actuals[numpy.arange(actuals.shape[0]), top_prediction] 42 | return numpy.average(hits) 43 | 44 | 45 | def calculate_precision_at_equal_recall_rate(predictions, actuals): 46 | """Performs a local (numpy) calculation of the PERR. 47 | 48 | Args: 49 | predictions: Matrix containing the outputs of the model. 50 | Dimensions are 'batch' x 'num_classes'. 51 | actuals: Matrix containing the ground truth labels. 52 | Dimensions are 'batch' x 'num_classes'. 53 | 54 | Returns: 55 | float: The average precision at equal recall rate across the entire batch. 56 | """ 57 | aggregated_precision = 0.0 58 | num_videos = actuals.shape[0] 59 | for row in numpy.arange(num_videos): 60 | num_labels = int(numpy.sum(actuals[row])) 61 | top_indices = numpy.argpartition(predictions[row], 62 | -num_labels)[-num_labels:] 63 | item_precision = 0.0 64 | for label_index in top_indices: 65 | if predictions[row][label_index] > 0: 66 | item_precision += actuals[row][label_index] 67 | item_precision /= top_indices.size 68 | aggregated_precision += item_precision 69 | aggregated_precision /= num_videos 70 | return aggregated_precision 71 | 72 | def calculate_gap(predictions, actuals, top_k=20): 73 | """Performs a local (numpy) calculation of the global average precision. 74 | 75 | Only the top_k predictions are taken for each of the videos. 76 | 77 | Args: 78 | predictions: Matrix containing the outputs of the model. 79 | Dimensions are 'batch' x 'num_classes'. 80 | actuals: Matrix containing the ground truth labels. 81 | Dimensions are 'batch' x 'num_classes'. 82 | top_k: How many predictions to use per video. 83 | 84 | Returns: 85 | float: The global average precision. 86 | """ 87 | gap_calculator = ap_calculator.AveragePrecisionCalculator() 88 | sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k) 89 | gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives)) 90 | return gap_calculator.peek_ap_at_n() 91 | 92 | 93 | def top_k_by_class(predictions, labels, k=20): 94 | """Extracts the top k predictions for each video, sorted by class. 
95 |
96 |   Args:
97 |     predictions: A numpy matrix containing the outputs of the model.
98 |       Dimensions are 'batch' x 'num_classes'.
99 |     labels: A numpy matrix containing the ground truth labels ('batch' x 'num_classes').
100 |     k: the top k non-zero entries to preserve in each prediction.
101 |   Returns:
102 |     A tuple (predictions, labels, true_positives). 'predictions' and 'labels'
103 |     are lists of lists of floats. 'true_positives' is a list of scalars. The
104 |     length of the lists is equal to the number of classes. The entries in the
105 |     predictions variable are probability predictions, and
106 |     the corresponding entries in the labels variable are the ground truth for
107 |     those predictions. The entries in 'true_positives' are the number of true
108 |     positives for each class in the ground truth.
109 |
110 |   Raises:
111 |     ValueError: An error occurred when the k is not a positive integer.
112 |   """
113 |   if k <= 0:
114 |     raise ValueError("k must be a positive integer.")
115 |   k = min(k, predictions.shape[1])
116 |   num_classes = predictions.shape[1]
117 |   prediction_triplets = []
118 |   for video_index in range(predictions.shape[0]):
119 |     prediction_triplets.extend(top_k_triplets(predictions[video_index], labels[video_index], k))
120 |   out_predictions = [[] for v in range(num_classes)]
121 |   out_labels = [[] for v in range(num_classes)]
122 |   for triplet in prediction_triplets:
123 |     out_predictions[triplet[0]].append(triplet[1])
124 |     out_labels[triplet[0]].append(triplet[2])
125 |   out_true_positives = [numpy.sum(labels[:, i]) for i in range(num_classes)]
126 |
127 |   return out_predictions, out_labels, out_true_positives
128 |
129 | def top_k_triplets(predictions, labels, k=20):
130 |   """Get the top_k for a 1-d numpy array. Returns a sparse list of
131 |   (class_index, prediction, label) triplets."""
132 |   m = len(predictions)
133 |   k = min(k, m)
134 |   indices = numpy.argpartition(predictions, -k)[-k:]
135 |   return [(index, predictions[index], labels[index]) for index in indices]
136 |
137 | class EvaluationMetrics(object):
138 |   """A class to store the evaluation metrics."""
139 |
140 |   def __init__(self, num_class, top_k):
141 |     """Construct an EvaluationMetrics object to store the evaluation metrics.
142 |
143 |     Args:
144 |       num_class: A positive integer specifying the number of classes.
145 |       top_k: A positive integer specifying how many predictions are considered per video.
146 |
147 |     Raises:
148 |       ValueError: An error occurred when MeanAveragePrecisionCalculator cannot
149 |         be constructed.
150 |     """
151 |     self.sum_hit_at_one = 0.0
152 |     self.sum_perr = 0.0
153 |     self.sum_loss = 0.0
154 |     self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(num_class)
155 |     self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()
156 |     self.top_k = top_k
157 |     self.num_examples = 0
158 |
159 |   def accumulate(self, predictions, labels, loss):
160 |     """Accumulate the metrics calculated locally for this mini-batch.
161 |
162 |     Args:
163 |       predictions: A numpy matrix containing the outputs of the model.
164 |         Dimensions are 'batch' x 'num_classes'.
165 |       labels: A numpy matrix containing the ground truth labels.
166 |         Dimensions are 'batch' x 'num_classes'.
167 |       loss: A numpy array containing the loss for each sample.
168 |
169 |     Returns:
170 |       dictionary: A dictionary storing the metrics for the mini-batch.
171 |
172 |     Raises:
173 |       ValueError: An error occurred when the shape of predictions and actuals
174 |         does not match.
    """
    batch_size = labels.shape[0]
    mean_hit_at_one = calculate_hit_at_one(predictions, labels)
    mean_perr = calculate_precision_at_equal_recall_rate(predictions, labels)
    mean_loss = numpy.mean(loss)

    # Take the top self.top_k predictions per video.
    sparse_predictions, sparse_labels, num_positives = top_k_by_class(
        predictions, labels, self.top_k)
    self.map_calculator.accumulate(sparse_predictions, sparse_labels,
                                   num_positives)
    self.global_ap_calculator.accumulate(flatten(sparse_predictions),
                                         flatten(sparse_labels),
                                         sum(num_positives))

    self.num_examples += batch_size
    self.sum_hit_at_one += mean_hit_at_one * batch_size
    self.sum_perr += mean_perr * batch_size
    self.sum_loss += mean_loss * batch_size

    return {"hit_at_one": mean_hit_at_one, "perr": mean_perr,
            "loss": mean_loss}

  def get(self):
    """Calculate the evaluation metrics for the whole epoch.

    Raises:
      ValueError: If no examples were accumulated.

    Returns:
      dictionary: a dictionary storing the evaluation metrics for the epoch.
        The dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss,
        aps (default nan), and gap.
    """
    if self.num_examples <= 0:
      raise ValueError("total_sample must be positive.")
    avg_hit_at_one = self.sum_hit_at_one / self.num_examples
    avg_perr = self.sum_perr / self.num_examples
    avg_loss = self.sum_loss / self.num_examples

    aps = self.map_calculator.peek_map_at_n()
    gap = self.global_ap_calculator.peek_ap_at_n()

    return {"avg_hit_at_one": avg_hit_at_one, "avg_perr": avg_perr,
            "avg_loss": avg_loss, "aps": aps, "gap": gap}

  def clear(self):
    """Clear the evaluation metrics and reset the EvaluationMetrics object."""
    self.sum_hit_at_one = 0.0
    self.sum_perr = 0.0
    self.sum_loss = 0.0
    self.map_calculator.clear()
    self.global_ap_calculator.clear()
    self.num_examples = 0
--------------------------------------------------------------------------------
/export_model.py:
--------------------------------------------------------------------------------
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
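# Usage sketch (illustrative, not part of the original file): given a model,
# a reader, and a checkpoint produced by train.py, the exporter below writes
# a SavedModel whose single input is a batch of serialized SequenceExamples:
#
#   exporter = ModelExporter(frame_features=True, model=model, reader=reader)
#   exporter.export_model(model_dir="/tmp/yt8m_export",      # hypothetical path
#                         global_step_val=global_step,
#                         last_checkpoint=last_checkpoint)
#
# The exported serving signature maps "example_bytes" to the top class
# indexes and scores per video (see build_inputs_and_outputs below).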
14 | """Utilities to export a model for batch prediction.""" 15 | 16 | import tensorflow as tf 17 | import tensorflow.contrib.slim as slim 18 | 19 | from tensorflow.python.saved_model import builder as saved_model_builder 20 | from tensorflow.python.saved_model import signature_constants 21 | from tensorflow.python.saved_model import signature_def_utils 22 | from tensorflow.python.saved_model import tag_constants 23 | from tensorflow.python.saved_model import utils as saved_model_utils 24 | 25 | _TOP_PREDICTIONS_IN_OUTPUT = 20 26 | 27 | class ModelExporter(object): 28 | 29 | def __init__(self, frame_features, model, reader): 30 | self.frame_features = frame_features 31 | self.model = model 32 | self.reader = reader 33 | 34 | with tf.Graph().as_default() as graph: 35 | self.inputs, self.outputs = self.build_inputs_and_outputs() 36 | self.graph = graph 37 | self.saver = tf.train.Saver(tf.trainable_variables(), sharded=True) 38 | 39 | def export_model(self, model_dir, global_step_val, last_checkpoint): 40 | """Exports the model so that it can used for batch predictions.""" 41 | 42 | with self.graph.as_default(): 43 | with tf.Session() as session: 44 | session.run(tf.global_variables_initializer()) 45 | self.saver.restore(session, last_checkpoint) 46 | 47 | signature = signature_def_utils.build_signature_def( 48 | inputs=self.inputs, 49 | outputs=self.outputs, 50 | method_name=signature_constants.PREDICT_METHOD_NAME) 51 | 52 | signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 53 | signature} 54 | 55 | model_builder = saved_model_builder.SavedModelBuilder(model_dir) 56 | model_builder.add_meta_graph_and_variables(session, 57 | tags=[tag_constants.SERVING], 58 | signature_def_map=signature_map, 59 | clear_devices=True) 60 | model_builder.save() 61 | 62 | def build_inputs_and_outputs(self): 63 | 64 | if self.frame_features: 65 | 66 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 67 | 68 | fn = lambda x: self.build_prediction_graph(x) 69 | video_id_output, top_indices_output, top_predictions_output = ( 70 | tf.map_fn(fn, serialized_examples, 71 | dtype=(tf.string, tf.int32, tf.float32))) 72 | 73 | else: 74 | 75 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 76 | 77 | video_id_output, top_indices_output, top_predictions_output = ( 78 | self.build_prediction_graph(serialized_examples)) 79 | 80 | inputs = {"example_bytes": 81 | saved_model_utils.build_tensor_info(serialized_examples)} 82 | 83 | outputs = { 84 | "video_id": saved_model_utils.build_tensor_info(video_id_output), 85 | "class_indexes": saved_model_utils.build_tensor_info(top_indices_output), 86 | "predictions": saved_model_utils.build_tensor_info(top_predictions_output)} 87 | 88 | return inputs, outputs 89 | 90 | def build_prediction_graph(self, serialized_examples): 91 | 92 | video_id, model_input_raw, labels_batch, num_frames = ( 93 | self.reader.prepare_serialized_examples(serialized_examples)) 94 | 95 | feature_dim = len(model_input_raw.get_shape()) - 1 96 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 97 | 98 | with tf.name_scope("model"): 99 | result = self.model.create_model( 100 | model_input, 101 | num_frames=num_frames, 102 | vocab_size=self.reader.num_classes, 103 | labels=labels_batch, 104 | is_training=False) 105 | 106 | for variable in slim.get_model_variables(): 107 | tf.summary.histogram(variable.op.name, variable) 108 | 109 | predictions = result["predictions"] 110 | 111 | top_predictions, top_indices = tf.nn.top_k(predictions, 112 | 
                                               _TOP_PREDICTIONS_IN_OUTPUT)
    return video_id, top_indices, top_predictions
--------------------------------------------------------------------------------
/file_averaging.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import defaultdict, Counter

SUBMIT_PATH = ''
SIGFIGS = 6

def read_models(model_weights, blend=None):
    """Accumulates weighted (label, confidence) votes from each model's CSV."""
    if not blend:
        blend = defaultdict(Counter)
    for m, w in model_weights.items():
        print(m, w)
        with open(os.path.join(SUBMIT_PATH, m + '.csv'), 'r') as f:
            f.readline()  # skip the 'VideoID,LabelConfidencePairs' header
            for l in f:
                id, r = l.split(',')
                id, r = int(id), r.split(' ')
                # r is a flat [label, confidence, label, confidence, ...]
                # list; walk over every (label, confidence) pair.
                for i in range(0, len(r), 2):
                    k = int(r[i])
                    # Store confidences as scaled integers so the Counter
                    # sums stay exact.
                    v = int(10**(SIGFIGS - 1) * float(r[i + 1]))
                    blend[id][k] += w * v
    return blend


def write_models(blend, file_name, total_weight):
    """Writes the blended top-20 predictions back out in submission format."""
    with open(os.path.join(SUBMIT_PATH, file_name + '.csv'), 'w') as f:
        f.write('VideoID,LabelConfidencePairs\n')
        for id, v in blend.items():
            l = ' '.join(['{} {:.{}f}'.format(
                t[0],
                float(t[1]) / 10 ** (SIGFIGS - 1) / total_weight,
                SIGFIGS) for t in v.most_common(20)])
            f.write(','.join([str(id), l + '\n']))
    return None


# Equal-weight blend of the seven single-model submissions.
model_pred = {
    'test-gatednetvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-GRU-0002-1200-2': 1,
    'test-gatednetfvLF-128k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-gateddboflf-4096-1024-80-0002-300iter': 1,
    'test-softdboflf-8000-1024-80-0002-300iter': 1,
    'test-gatedlightvladLF-256k-1024-80-0002-300iter-norelu-basic-gatedmoe': 1,
    'test-lstm-0002-val-150-random': 1,
}

avg = read_models(model_pred)
write_models(avg, 'WILLOW_submission', sum(model_pred.values()))
--------------------------------------------------------------------------------
/frame_level_models.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains a collection of models which operate on variable-length sequences.
"""
import math

import models
import video_level_models
import tensorflow as tf
import model_utils as utils

import tensorflow.contrib.slim as slim
from tensorflow import flags

FLAGS = flags.FLAGS


flags.DEFINE_bool("gating_remove_diag", False,
                  "Remove diag for self gating")
flags.DEFINE_bool("lightvlad", False,
                  "Light or full NetVLAD")
flags.DEFINE_bool("vlagd", False,
                  "vlagd or vlad")


flags.DEFINE_integer("iterations", 30,
                     "Number of frames per batch for DBoF.")
flags.DEFINE_bool("dbof_add_batch_norm", True,
                  "Adds batch normalization to the DBoF model.")
flags.DEFINE_bool(
    "sample_random_frames", True,
    "If true, samples random frames (for frame level models). If false, a "
    "random sequence of frames is sampled instead.")
flags.DEFINE_integer("dbof_cluster_size", 16384,
                     "Number of units in the DBoF cluster layer.")
flags.DEFINE_integer("dbof_hidden_size", 2048,
                     "Number of units in the DBoF hidden layer.")
flags.DEFINE_bool("dbof_relu", True, "Add ReLU to the DBoF hidden layer.")
flags.DEFINE_integer("dbof_var_features", 0,
                     "Variance features on top of the DBoF cluster layer.")

flags.DEFINE_string("dbof_activation", "relu", "DBoF cluster activation.")

flags.DEFINE_bool("softdbof_maxpool", False, "Add max pooling to Soft-DBoF.")

flags.DEFINE_integer("netvlad_cluster_size", 64,
                     "Number of units in the NetVLAD cluster layer.")
flags.DEFINE_bool("netvlad_relu", True, "Add ReLU to the NetVLAD hidden layer.")
flags.DEFINE_integer("netvlad_dimred", -1,
                     "NetVLAD output dimension reduction")
flags.DEFINE_integer("gatednetvlad_dimred", 1024,
                     "GatedNetVLAD output dimension reduction")

flags.DEFINE_bool("gating", False,
                  "Gating for NetVLAD")
flags.DEFINE_integer("hidden_size", 1024,
                     "Size of the hidden layer for BasicStatModel.")


flags.DEFINE_integer("netvlad_hidden_size", 1024,
                     "Number of units in the NetVLAD hidden layer.")

flags.DEFINE_integer("netvlad_hidden_size_video", 1024,
                     "Number of units in the NetVLAD video hidden layer.")

flags.DEFINE_integer("netvlad_hidden_size_audio", 64,
                     "Number of units in the NetVLAD audio hidden layer.")


flags.DEFINE_bool("netvlad_add_batch_norm", True,
                  "Adds batch normalization to the NetVLAD model.")

flags.DEFINE_integer("fv_cluster_size", 64,
                     "Number of units in the NetFV cluster layer.")

flags.DEFINE_integer("fv_hidden_size", 2048,
                     "Number of units in the NetFV hidden layer.")
flags.DEFINE_bool("fv_relu", True,
                  "ReLU after the NetFV hidden layer.")


flags.DEFINE_bool("fv_couple_weights", True,
                  "Couple the cluster and covariance weights or not")

flags.DEFINE_float("fv_coupling_factor", 0.01,
                   "Coupling factor")


flags.DEFINE_string("dbof_pooling_method", "max",
                    "The pooling method used in the DBoF cluster layer. "
                    "Choices are 'average' and 'max'.")
" 108 | "Choices are 'average' and 'max'.") 109 | flags.DEFINE_string("video_level_classifier_model", "MoeModel", 110 | "Some Frame-Level models can be decomposed into a " 111 | "generalized pooling operation followed by a " 112 | "classifier layer") 113 | flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.") 114 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.") 115 | flags.DEFINE_integer("lstm_cells_video", 1024, "Number of LSTM cells (video).") 116 | flags.DEFINE_integer("lstm_cells_audio", 128, "Number of LSTM cells (audio).") 117 | 118 | 119 | 120 | flags.DEFINE_integer("gru_cells", 1024, "Number of GRU cells.") 121 | flags.DEFINE_integer("gru_cells_video", 1024, "Number of GRU cells (video).") 122 | flags.DEFINE_integer("gru_cells_audio", 128, "Number of GRU cells (audio).") 123 | flags.DEFINE_integer("gru_layers", 2, "Number of GRU layers.") 124 | flags.DEFINE_bool("lstm_random_sequence", False, 125 | "Random sequence input for lstm.") 126 | flags.DEFINE_bool("gru_random_sequence", False, 127 | "Random sequence input for gru.") 128 | flags.DEFINE_bool("gru_backward", False, "BW reading for GRU") 129 | flags.DEFINE_bool("lstm_backward", False, "BW reading for LSTM") 130 | 131 | 132 | flags.DEFINE_bool("fc_dimred", True, "Adding FC dimred after pooling") 133 | 134 | class LightVLAD(): 135 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): 136 | self.feature_size = feature_size 137 | self.max_frames = max_frames 138 | self.is_training = is_training 139 | self.add_batch_norm = add_batch_norm 140 | self.cluster_size = cluster_size 141 | 142 | def forward(self,reshaped_input): 143 | 144 | 145 | cluster_weights = tf.get_variable("cluster_weights", 146 | [self.feature_size, self.cluster_size], 147 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 148 | 149 | activation = tf.matmul(reshaped_input, cluster_weights) 150 | 151 | if self.add_batch_norm: 152 | activation = slim.batch_norm( 153 | activation, 154 | center=True, 155 | scale=True, 156 | is_training=self.is_training, 157 | scope="cluster_bn") 158 | else: 159 | cluster_biases = tf.get_variable("cluster_biases", 160 | [cluster_size], 161 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 162 | tf.summary.histogram("cluster_biases", cluster_biases) 163 | activation += cluster_biases 164 | 165 | activation = tf.nn.softmax(activation) 166 | 167 | activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) 168 | 169 | activation = tf.transpose(activation,perm=[0,2,1]) 170 | 171 | reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) 172 | vlad = tf.matmul(activation,reshaped_input) 173 | 174 | vlad = tf.transpose(vlad,perm=[0,2,1]) 175 | vlad = tf.nn.l2_normalize(vlad,1) 176 | 177 | vlad = tf.reshape(vlad,[-1,self.cluster_size*self.feature_size]) 178 | vlad = tf.nn.l2_normalize(vlad,1) 179 | 180 | return vlad 181 | 182 | 183 | class NetVLAD(): 184 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): 185 | self.feature_size = feature_size 186 | self.max_frames = max_frames 187 | self.is_training = is_training 188 | self.add_batch_norm = add_batch_norm 189 | self.cluster_size = cluster_size 190 | 191 | def forward(self,reshaped_input): 192 | 193 | 194 | cluster_weights = tf.get_variable("cluster_weights", 195 | [self.feature_size, self.cluster_size], 196 | initializer = tf.random_normal_initializer(stddev=1 / 
class NetVLAD():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    # a_sum is the total soft-assignment mass per cluster; multiplying it by
    # the cluster centers gives the term subtracted to form residuals.
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)

    vlad = tf.nn.l2_normalize(vlad, 1)

    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)

    return vlad


class NetVLAGD():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    gate_weights = tf.get_variable(
        "gate_weights",
        [1, self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    gate_weights = tf.sigmoid(gate_weights)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])

    vlagd = tf.matmul(activation, reshaped_input)
    vlagd = tf.multiply(vlagd, gate_weights)

    vlagd = tf.transpose(vlagd, perm=[0, 2, 1])

    vlagd = tf.nn.l2_normalize(vlagd, 1)

    vlagd = tf.reshape(vlagd, [-1, self.cluster_size * self.feature_size])
    vlagd = tf.nn.l2_normalize(vlagd, 1)

    return vlagd

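# All pooling classes in this file follow the same recipe: softmax
# soft-assignment of every frame to `cluster_size` anchors, then an
# aggregated statistic of the assignments. A NumPy sketch of the NetVLAD
# case for a single video (ours, for illustration only; shapes follow the
# 1024-d RGB features):
#
#   import numpy as np
#   X = np.random.randn(300, 1024)                # frames x feature_size
#   W = np.random.randn(1024, 64)                 # cluster_weights
#   C = np.random.randn(64, 1024)                 # cluster centers
#   A = np.exp(X @ W)
#   A /= A.sum(axis=1, keepdims=True)             # softmax soft assignment
#   vlad = A.T @ X - A.sum(axis=0)[:, None] * C   # residual sums per cluster
#   vlad /= np.linalg.norm(vlad, axis=1, keepdims=True)  # intra-normalization
#   v = vlad.reshape(-1)
#   v /= np.linalg.norm(v)                        # final descriptor
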
class GatedDBoF():
  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    activation_sum = tf.reduce_sum(activation, 1)

    activation_max = tf.reduce_max(activation, 1)
    activation_max = tf.nn.l2_normalize(activation_max, 1)

    # The max-pooled histogram drives the gates applied to the sum-pooled one.
    dim_red = tf.get_variable(
        "dim_red",
        [cluster_size, feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    cluster_weights_2 = tf.get_variable(
        "cluster_weights_2",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights_2", cluster_weights_2)

    activation = tf.matmul(activation_max, dim_red)
    activation = tf.matmul(activation, cluster_weights_2)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn_2")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases_2",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases_2", cluster_biases)
      activation += cluster_biases

    activation = tf.sigmoid(activation)

    activation = tf.multiply(activation, activation_sum)
    activation = tf.nn.l2_normalize(activation, 1)

    return activation


class SoftDBoF():
  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    # Sum-pool the soft assignments over frames (a soft bag-of-words count).
    activation_sum = tf.reduce_sum(activation, 1)
    activation_sum = tf.nn.l2_normalize(activation_sum, 1)

    if max_pool:
      activation_max = tf.reduce_max(activation, 1)
      activation_max = tf.nn.l2_normalize(activation_max, 1)
      activation = tf.concat([activation_sum, activation_max], 1)
    else:
      activation = activation_sum

    return activation


class DBoF():
  def __init__(self, feature_size, max_frames, cluster_size, activation,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.activation = activation

  def forward(self, reshaped_input):

    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    # Compare the configured activation type (a string held in
    # self.activation), not the tensor named 'activation' above.
    if self.activation == 'glu':
      # Gated linear unit: the first half of the units carries the signal,
      # the second half gates it, halving the output width.
      space = activation[:, :cluster_size // 2]
      gates = tf.sigmoid(activation[:, cluster_size // 2:])
      activation = tf.multiply(space, gates)
      out_size = cluster_size // 2
    else:
      if self.activation == 'relu':
        activation = tf.nn.relu6(activation)
      out_size = cluster_size

    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, out_size])

    avg_activation = utils.FramePooling(activation, 'average')
    avg_activation = tf.nn.l2_normalize(avg_activation, 1)

    max_activation = utils.FramePooling(activation, 'max')
    max_activation = tf.nn.l2_normalize(max_activation, 1)

    return tf.concat([avg_activation, max_activation], 1)

class NetFV():
  def __init__(self, feature_size, max_frames, cluster_size,
add_batch_norm, is_training): 507 | self.feature_size = feature_size 508 | self.max_frames = max_frames 509 | self.is_training = is_training 510 | self.add_batch_norm = add_batch_norm 511 | self.cluster_size = cluster_size 512 | 513 | def forward(self,reshaped_input): 514 | cluster_weights = tf.get_variable("cluster_weights", 515 | [self.feature_size, self.cluster_size], 516 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 517 | 518 | covar_weights = tf.get_variable("covar_weights", 519 | [self.feature_size, self.cluster_size], 520 | initializer = tf.random_normal_initializer(mean=1.0, stddev=1 /math.sqrt(self.feature_size))) 521 | 522 | covar_weights = tf.square(covar_weights) 523 | eps = tf.constant([1e-6]) 524 | covar_weights = tf.add(covar_weights,eps) 525 | 526 | tf.summary.histogram("cluster_weights", cluster_weights) 527 | activation = tf.matmul(reshaped_input, cluster_weights) 528 | if self.add_batch_norm: 529 | activation = slim.batch_norm( 530 | activation, 531 | center=True, 532 | scale=True, 533 | is_training=self.is_training, 534 | scope="cluster_bn") 535 | else: 536 | cluster_biases = tf.get_variable("cluster_biases", 537 | [self.cluster_size], 538 | initializer = tf.random_normal(stddev=1 / math.sqrt(self.feature_size))) 539 | tf.summary.histogram("cluster_biases", cluster_biases) 540 | activation += cluster_biases 541 | 542 | activation = tf.nn.softmax(activation) 543 | tf.summary.histogram("cluster_output", activation) 544 | 545 | activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) 546 | 547 | a_sum = tf.reduce_sum(activation,-2,keep_dims=True) 548 | 549 | if not FLAGS.fv_couple_weights: 550 | cluster_weights2 = tf.get_variable("cluster_weights2", 551 | [1,self.feature_size, self.cluster_size], 552 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) 553 | else: 554 | cluster_weights2 = tf.scalar_mul(FLAGS.fv_coupling_factor,cluster_weights) 555 | 556 | a = tf.multiply(a_sum,cluster_weights2) 557 | 558 | activation = tf.transpose(activation,perm=[0,2,1]) 559 | 560 | reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) 561 | fv1 = tf.matmul(activation,reshaped_input) 562 | 563 | fv1 = tf.transpose(fv1,perm=[0,2,1]) 564 | 565 | # computing second order FV 566 | a2 = tf.multiply(a_sum,tf.square(cluster_weights2)) 567 | 568 | b2 = tf.multiply(fv1,cluster_weights2) 569 | fv2 = tf.matmul(activation,tf.square(reshaped_input)) 570 | 571 | fv2 = tf.transpose(fv2,perm=[0,2,1]) 572 | fv2 = tf.add_n([a2,fv2,tf.scalar_mul(-2,b2)]) 573 | 574 | fv2 = tf.divide(fv2,tf.square(covar_weights)) 575 | fv2 = tf.subtract(fv2,a_sum) 576 | 577 | fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size]) 578 | 579 | fv2 = tf.nn.l2_normalize(fv2,1) 580 | fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size]) 581 | fv2 = tf.nn.l2_normalize(fv2,1) 582 | 583 | fv1 = tf.subtract(fv1,a) 584 | fv1 = tf.divide(fv1,covar_weights) 585 | 586 | fv1 = tf.nn.l2_normalize(fv1,1) 587 | fv1 = tf.reshape(fv1,[-1,self.cluster_size*self.feature_size]) 588 | fv1 = tf.nn.l2_normalize(fv1,1) 589 | 590 | return tf.concat([fv1,fv2],1) 591 | 592 | class NetVLADModelLF(models.BaseModel): 593 | """Creates a NetVLAD based model. 594 | 595 | Args: 596 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 597 | input features. 598 | vocab_size: The number of classes in the dataset. 
599 | num_frames: A vector of length 'batch' which indicates the number of 600 | frames for each video (before padding). 601 | 602 | Returns: 603 | A dictionary with a tensor containing the probability predictions of the 604 | model in the 'predictions' key. The dimensions of the tensor are 605 | 'batch_size' x 'num_classes'. 606 | """ 607 | 608 | 609 | def create_model(self, 610 | model_input, 611 | vocab_size, 612 | num_frames, 613 | iterations=None, 614 | add_batch_norm=None, 615 | sample_random_frames=None, 616 | cluster_size=None, 617 | hidden_size=None, 618 | is_training=True, 619 | **unused_params): 620 | iterations = iterations or FLAGS.iterations 621 | add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm 622 | random_frames = sample_random_frames or FLAGS.sample_random_frames 623 | cluster_size = cluster_size or FLAGS.netvlad_cluster_size 624 | hidden1_size = hidden_size or FLAGS.netvlad_hidden_size 625 | relu = FLAGS.netvlad_relu 626 | dimred = FLAGS.netvlad_dimred 627 | gating = FLAGS.gating 628 | remove_diag = FLAGS.gating_remove_diag 629 | lightvlad = FLAGS.lightvlad 630 | vlagd = FLAGS.vlagd 631 | 632 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 633 | if random_frames: 634 | model_input = utils.SampleRandomFrames(model_input, num_frames, 635 | iterations) 636 | else: 637 | model_input = utils.SampleRandomSequence(model_input, num_frames, 638 | iterations) 639 | 640 | 641 | max_frames = model_input.get_shape().as_list()[1] 642 | feature_size = model_input.get_shape().as_list()[2] 643 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 644 | 645 | if lightvlad: 646 | video_NetVLAD = LightVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training) 647 | audio_NetVLAD = LightVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 648 | elif vlagd: 649 | video_NetVLAD = NetVLAGD(1024,max_frames,cluster_size, add_batch_norm, is_training) 650 | audio_NetVLAD = NetVLAGD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 651 | else: 652 | video_NetVLAD = NetVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training) 653 | audio_NetVLAD = NetVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training) 654 | 655 | 656 | if add_batch_norm:# and not lightvlad: 657 | reshaped_input = slim.batch_norm( 658 | reshaped_input, 659 | center=True, 660 | scale=True, 661 | is_training=is_training, 662 | scope="input_bn") 663 | 664 | with tf.variable_scope("video_VLAD"): 665 | vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) 666 | 667 | with tf.variable_scope("audio_VLAD"): 668 | vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:]) 669 | 670 | vlad = tf.concat([vlad_video, vlad_audio],1) 671 | 672 | vlad_dim = vlad.get_shape().as_list()[1] 673 | hidden1_weights = tf.get_variable("hidden1_weights", 674 | [vlad_dim, hidden1_size], 675 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 676 | 677 | activation = tf.matmul(vlad, hidden1_weights) 678 | 679 | if add_batch_norm and relu: 680 | activation = slim.batch_norm( 681 | activation, 682 | center=True, 683 | scale=True, 684 | is_training=is_training, 685 | scope="hidden1_bn") 686 | 687 | else: 688 | hidden1_biases = tf.get_variable("hidden1_biases", 689 | [hidden1_size], 690 | initializer = tf.random_normal_initializer(stddev=0.01)) 691 | tf.summary.histogram("hidden1_biases", hidden1_biases) 692 | activation += hidden1_biases 693 | 694 | if relu: 695 | activation = tf.nn.relu6(activation) 696 | 697 | 698 | if gating: 699 | 
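      # Context Gating (Miech et al., 2017): re-weight the pooled descriptor
      # with learned sigmoid gates, roughly
      #   activation <- sigmoid(activation . W [+ b]) * activation,
      # so correlated clusters can suppress or boost one another.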
      gating_weights = tf.get_variable(
          "gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))

      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients so a unit cannot gate itself.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)

      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)

class DbofModelLF(models.BaseModel):
  """Creates a Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
759 | """ 760 | 761 | def create_model(self, 762 | model_input, 763 | vocab_size, 764 | num_frames, 765 | iterations=None, 766 | add_batch_norm=None, 767 | sample_random_frames=None, 768 | cluster_size=None, 769 | hidden_size=None, 770 | is_training=True, 771 | **unused_params): 772 | iterations = iterations or FLAGS.iterations 773 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 774 | random_frames = sample_random_frames or FLAGS.sample_random_frames 775 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 776 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 777 | relu = FLAGS.dbof_relu 778 | cluster_activation = FLAGS.dbof_activation 779 | 780 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 781 | if random_frames: 782 | model_input = utils.SampleRandomFrames(model_input, num_frames, 783 | iterations) 784 | else: 785 | model_input = utils.SampleRandomSequence(model_input, num_frames, 786 | iterations) 787 | max_frames = model_input.get_shape().as_list()[1] 788 | feature_size = model_input.get_shape().as_list()[2] 789 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 790 | tf.summary.histogram("input_hist", reshaped_input) 791 | 792 | if cluster_activation == 'glu': 793 | cluster_size = 2*cluster_size 794 | 795 | video_Dbof = DBoF(1024,max_frames,cluster_size, cluster_activation, add_batch_norm, is_training) 796 | audio_Dbof = DBoF(128,max_frames,cluster_size/8, cluster_activation, add_batch_norm, is_training) 797 | 798 | 799 | if add_batch_norm: 800 | reshaped_input = slim.batch_norm( 801 | reshaped_input, 802 | center=True, 803 | scale=True, 804 | is_training=is_training, 805 | scope="input_bn") 806 | 807 | with tf.variable_scope("video_DBOF"): 808 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 809 | 810 | with tf.variable_scope("audio_DBOF"): 811 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 812 | 813 | dbof = tf.concat([dbof_video, dbof_audio],1) 814 | 815 | dbof_dim = dbof.get_shape().as_list()[1] 816 | 817 | hidden1_weights = tf.get_variable("hidden1_weights", 818 | [dbof_dim, hidden1_size], 819 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 820 | tf.summary.histogram("hidden1_weights", hidden1_weights) 821 | activation = tf.matmul(dbof, hidden1_weights) 822 | 823 | if add_batch_norm and relu: 824 | activation = slim.batch_norm( 825 | activation, 826 | center=True, 827 | scale=True, 828 | is_training=is_training, 829 | scope="hidden1_bn") 830 | else: 831 | hidden1_biases = tf.get_variable("hidden1_biases", 832 | [hidden1_size], 833 | initializer = tf.random_normal_initializer(stddev=0.01)) 834 | tf.summary.histogram("hidden1_biases", hidden1_biases) 835 | activation += hidden1_biases 836 | 837 | if relu: 838 | activation = tf.nn.relu6(activation) 839 | tf.summary.histogram("hidden1_output", activation) 840 | 841 | aggregated_model = getattr(video_level_models, 842 | FLAGS.video_level_classifier_model) 843 | 844 | return aggregated_model().create_model( 845 | model_input=activation, 846 | vocab_size=vocab_size, 847 | **unused_params) 848 | 849 | class GatedDbofModelLF(models.BaseModel): 850 | """Creates a Gated Deep Bag of Frames model. 851 | 852 | The model projects the features for each frame into a higher dimensional 853 | 'clustering' space, pools across frames in that space, and then 854 | uses a configurable video-level model to classify the now aggregated features. 
855 | 856 | The model will randomly sample either frames or sequences of frames during 857 | training to speed up convergence. 858 | 859 | Args: 860 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 861 | input features. 862 | vocab_size: The number of classes in the dataset. 863 | num_frames: A vector of length 'batch' which indicates the number of 864 | frames for each video (before padding). 865 | 866 | Returns: 867 | A dictionary with a tensor containing the probability predictions of the 868 | model in the 'predictions' key. The dimensions of the tensor are 869 | 'batch_size' x 'num_classes'. 870 | """ 871 | 872 | def create_model(self, 873 | model_input, 874 | vocab_size, 875 | num_frames, 876 | iterations=None, 877 | add_batch_norm=None, 878 | sample_random_frames=None, 879 | cluster_size=None, 880 | hidden_size=None, 881 | is_training=True, 882 | **unused_params): 883 | iterations = iterations or FLAGS.iterations 884 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 885 | random_frames = sample_random_frames or FLAGS.sample_random_frames 886 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 887 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 888 | fc_dimred = FLAGS.fc_dimred 889 | relu = FLAGS.dbof_relu 890 | max_pool = FLAGS.softdbof_maxpool 891 | 892 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 893 | if random_frames: 894 | model_input = utils.SampleRandomFrames(model_input, num_frames, 895 | iterations) 896 | else: 897 | model_input = utils.SampleRandomSequence(model_input, num_frames, 898 | iterations) 899 | max_frames = model_input.get_shape().as_list()[1] 900 | feature_size = model_input.get_shape().as_list()[2] 901 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 902 | tf.summary.histogram("input_hist", reshaped_input) 903 | 904 | video_Dbof = GatedDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) 905 | audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) 906 | 907 | 908 | if add_batch_norm: 909 | reshaped_input = slim.batch_norm( 910 | reshaped_input, 911 | center=True, 912 | scale=True, 913 | is_training=is_training, 914 | scope="input_bn") 915 | 916 | with tf.variable_scope("video_DBOF"): 917 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 918 | 919 | with tf.variable_scope("audio_DBOF"): 920 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 921 | 922 | dbof = tf.concat([dbof_video, dbof_audio],1) 923 | 924 | dbof_dim = dbof.get_shape().as_list()[1] 925 | 926 | if fc_dimred: 927 | hidden1_weights = tf.get_variable("hidden1_weights", 928 | [dbof_dim, hidden1_size], 929 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 930 | tf.summary.histogram("hidden1_weights", hidden1_weights) 931 | activation = tf.matmul(dbof, hidden1_weights) 932 | 933 | if add_batch_norm and relu: 934 | activation = slim.batch_norm( 935 | activation, 936 | center=True, 937 | scale=True, 938 | is_training=is_training, 939 | scope="hidden1_bn") 940 | else: 941 | hidden1_biases = tf.get_variable("hidden1_biases", 942 | [hidden1_size], 943 | initializer = tf.random_normal_initializer(stddev=0.01)) 944 | tf.summary.histogram("hidden1_biases", hidden1_biases) 945 | activation += hidden1_biases 946 | 947 | if relu: 948 | activation = tf.nn.relu6(activation) 949 | tf.summary.histogram("hidden1_output", activation) 950 | else: 951 | activation = dbof 952 | 953 | aggregated_model = getattr(video_level_models, 954 | 
FLAGS.video_level_classifier_model) 955 | 956 | 957 | return aggregated_model().create_model( 958 | model_input=activation, 959 | vocab_size=vocab_size, 960 | is_training=is_training, 961 | **unused_params) 962 | 963 | 964 | class SoftDbofModelLF(models.BaseModel): 965 | """Creates a Soft Deep Bag of Frames model. 966 | 967 | The model projects the features for each frame into a higher dimensional 968 | 'clustering' space, pools across frames in that space, and then 969 | uses a configurable video-level model to classify the now aggregated features. 970 | 971 | The model will randomly sample either frames or sequences of frames during 972 | training to speed up convergence. 973 | 974 | Args: 975 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 976 | input features. 977 | vocab_size: The number of classes in the dataset. 978 | num_frames: A vector of length 'batch' which indicates the number of 979 | frames for each video (before padding). 980 | 981 | Returns: 982 | A dictionary with a tensor containing the probability predictions of the 983 | model in the 'predictions' key. The dimensions of the tensor are 984 | 'batch_size' x 'num_classes'. 985 | """ 986 | 987 | def create_model(self, 988 | model_input, 989 | vocab_size, 990 | num_frames, 991 | iterations=None, 992 | add_batch_norm=None, 993 | sample_random_frames=None, 994 | cluster_size=None, 995 | hidden_size=None, 996 | is_training=True, 997 | **unused_params): 998 | iterations = iterations or FLAGS.iterations 999 | add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm 1000 | random_frames = sample_random_frames or FLAGS.sample_random_frames 1001 | cluster_size = cluster_size or FLAGS.dbof_cluster_size 1002 | hidden1_size = hidden_size or FLAGS.dbof_hidden_size 1003 | fc_dimred = FLAGS.fc_dimred 1004 | relu = FLAGS.dbof_relu 1005 | max_pool = FLAGS.softdbof_maxpool 1006 | 1007 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1008 | if random_frames: 1009 | model_input = utils.SampleRandomFrames(model_input, num_frames, 1010 | iterations) 1011 | else: 1012 | model_input = utils.SampleRandomSequence(model_input, num_frames, 1013 | iterations) 1014 | max_frames = model_input.get_shape().as_list()[1] 1015 | feature_size = model_input.get_shape().as_list()[2] 1016 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 1017 | tf.summary.histogram("input_hist", reshaped_input) 1018 | 1019 | video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) 1020 | audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) 1021 | 1022 | 1023 | if add_batch_norm: 1024 | reshaped_input = slim.batch_norm( 1025 | reshaped_input, 1026 | center=True, 1027 | scale=True, 1028 | is_training=is_training, 1029 | scope="input_bn") 1030 | 1031 | with tf.variable_scope("video_DBOF"): 1032 | dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 1033 | 1034 | with tf.variable_scope("audio_DBOF"): 1035 | dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) 1036 | 1037 | dbof = tf.concat([dbof_video, dbof_audio],1) 1038 | 1039 | dbof_dim = dbof.get_shape().as_list()[1] 1040 | 1041 | if fc_dimred: 1042 | hidden1_weights = tf.get_variable("hidden1_weights", 1043 | [dbof_dim, hidden1_size], 1044 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 1045 | tf.summary.histogram("hidden1_weights", hidden1_weights) 1046 | activation = tf.matmul(dbof, hidden1_weights) 1047 | 1048 | if add_batch_norm and relu: 1049 | 
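        # With batch normalization the learned beta offset subsumes the bias,
        # so explicit hidden1_biases are only created in the else branch.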
activation = slim.batch_norm( 1050 | activation, 1051 | center=True, 1052 | scale=True, 1053 | is_training=is_training, 1054 | scope="hidden1_bn") 1055 | else: 1056 | hidden1_biases = tf.get_variable("hidden1_biases", 1057 | [hidden1_size], 1058 | initializer = tf.random_normal_initializer(stddev=0.01)) 1059 | tf.summary.histogram("hidden1_biases", hidden1_biases) 1060 | activation += hidden1_biases 1061 | 1062 | if relu: 1063 | activation = tf.nn.relu6(activation) 1064 | tf.summary.histogram("hidden1_output", activation) 1065 | else: 1066 | activation = dbof 1067 | 1068 | aggregated_model = getattr(video_level_models, 1069 | FLAGS.video_level_classifier_model) 1070 | 1071 | 1072 | return aggregated_model().create_model( 1073 | model_input=activation, 1074 | vocab_size=vocab_size, 1075 | is_training=is_training, 1076 | **unused_params) 1077 | 1078 | class LstmModel(models.BaseModel): 1079 | 1080 | def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): 1081 | """Creates a model which uses a stack of LSTMs to represent the video. 1082 | 1083 | Args: 1084 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1085 | input features. 1086 | vocab_size: The number of classes in the dataset. 1087 | num_frames: A vector of length 'batch' which indicates the number of 1088 | frames for each video (before padding). 1089 | 1090 | Returns: 1091 | A dictionary with a tensor containing the probability predictions of the 1092 | model in the 'predictions' key. The dimensions of the tensor are 1093 | 'batch_size' x 'num_classes'. 1094 | """ 1095 | lstm_size = FLAGS.lstm_cells 1096 | number_of_layers = FLAGS.lstm_layers 1097 | random_frames = FLAGS.lstm_random_sequence 1098 | iterations = FLAGS.iterations 1099 | backward = FLAGS.lstm_backward 1100 | 1101 | if random_frames: 1102 | num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1103 | model_input = utils.SampleRandomFrames(model_input, num_frames_2, 1104 | iterations) 1105 | if backward: 1106 | model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) 1107 | 1108 | stacked_lstm = tf.contrib.rnn.MultiRNNCell( 1109 | [ 1110 | tf.contrib.rnn.BasicLSTMCell( 1111 | lstm_size, forget_bias=1.0, state_is_tuple=False) 1112 | for _ in range(number_of_layers) 1113 | ], state_is_tuple=False) 1114 | 1115 | loss = 0.0 1116 | with tf.variable_scope("RNN"): 1117 | outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, 1118 | sequence_length=num_frames, 1119 | dtype=tf.float32) 1120 | 1121 | aggregated_model = getattr(video_level_models, 1122 | FLAGS.video_level_classifier_model) 1123 | 1124 | return aggregated_model().create_model( 1125 | model_input=state, 1126 | vocab_size=vocab_size, 1127 | is_training=is_training, 1128 | **unused_params) 1129 | 1130 | 1131 | class GruModel(models.BaseModel): 1132 | 1133 | def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): 1134 | """Creates a model which uses a stack of GRUs to represent the video. 1135 | 1136 | Args: 1137 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1138 | input features. 1139 | vocab_size: The number of classes in the dataset. 1140 | num_frames: A vector of length 'batch' which indicates the number of 1141 | frames for each video (before padding). 1142 | 1143 | Returns: 1144 | A dictionary with a tensor containing the probability predictions of the 1145 | model in the 'predictions' key. 
The dimensions of the tensor are 1146 | 'batch_size' x 'num_classes'. 1147 | """ 1148 | gru_size = FLAGS.gru_cells 1149 | number_of_layers = FLAGS.gru_layers 1150 | backward = FLAGS.gru_backward 1151 | random_frames = FLAGS.gru_random_sequence 1152 | iterations = FLAGS.iterations 1153 | 1154 | if random_frames: 1155 | num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1156 | model_input = utils.SampleRandomFrames(model_input, num_frames_2, 1157 | iterations) 1158 | 1159 | if backward: 1160 | model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) 1161 | 1162 | stacked_GRU = tf.contrib.rnn.MultiRNNCell( 1163 | [ 1164 | tf.contrib.rnn.GRUCell(gru_size) 1165 | for _ in range(number_of_layers) 1166 | ], state_is_tuple=False) 1167 | 1168 | loss = 0.0 1169 | with tf.variable_scope("RNN"): 1170 | outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input, 1171 | sequence_length=num_frames, 1172 | dtype=tf.float32) 1173 | 1174 | aggregated_model = getattr(video_level_models, 1175 | FLAGS.video_level_classifier_model) 1176 | return aggregated_model().create_model( 1177 | model_input=state, 1178 | vocab_size=vocab_size, 1179 | is_training=is_training, 1180 | **unused_params) 1181 | 1182 | 1183 | 1184 | class NetFVModelLF(models.BaseModel): 1185 | """Creates a NetFV based model. 1186 | It emulates a Gaussian Mixture Fisher Vector pooling operations 1187 | 1188 | Args: 1189 | model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of 1190 | input features. 1191 | vocab_size: The number of classes in the dataset. 1192 | num_frames: A vector of length 'batch' which indicates the number of 1193 | frames for each video (before padding). 1194 | 1195 | Returns: 1196 | A dictionary with a tensor containing the probability predictions of the 1197 | model in the 'predictions' key. The dimensions of the tensor are 1198 | 'batch_size' x 'num_classes'. 
1199 | """ 1200 | 1201 | 1202 | def create_model(self, 1203 | model_input, 1204 | vocab_size, 1205 | num_frames, 1206 | iterations=None, 1207 | add_batch_norm=None, 1208 | sample_random_frames=None, 1209 | cluster_size=None, 1210 | hidden_size=None, 1211 | is_training=True, 1212 | **unused_params): 1213 | iterations = iterations or FLAGS.iterations 1214 | add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm 1215 | random_frames = sample_random_frames or FLAGS.sample_random_frames 1216 | cluster_size = cluster_size or FLAGS.fv_cluster_size 1217 | hidden1_size = hidden_size or FLAGS.fv_hidden_size 1218 | relu = FLAGS.fv_relu 1219 | gating = FLAGS.gating 1220 | 1221 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) 1222 | if random_frames: 1223 | model_input = utils.SampleRandomFrames(model_input, num_frames, 1224 | iterations) 1225 | else: 1226 | model_input = utils.SampleRandomSequence(model_input, num_frames, 1227 | iterations) 1228 | max_frames = model_input.get_shape().as_list()[1] 1229 | feature_size = model_input.get_shape().as_list()[2] 1230 | reshaped_input = tf.reshape(model_input, [-1, feature_size]) 1231 | tf.summary.histogram("input_hist", reshaped_input) 1232 | 1233 | video_NetFV = NetFV(1024,max_frames,cluster_size, add_batch_norm, is_training) 1234 | audio_NetFV = NetFV(128,max_frames,cluster_size/2, add_batch_norm, is_training) 1235 | 1236 | 1237 | if add_batch_norm: 1238 | reshaped_input = slim.batch_norm( 1239 | reshaped_input, 1240 | center=True, 1241 | scale=True, 1242 | is_training=is_training, 1243 | scope="input_bn") 1244 | 1245 | with tf.variable_scope("video_FV"): 1246 | fv_video = video_NetFV.forward(reshaped_input[:,0:1024]) 1247 | 1248 | with tf.variable_scope("audio_FV"): 1249 | fv_audio = audio_NetFV.forward(reshaped_input[:,1024:]) 1250 | 1251 | fv = tf.concat([fv_video, fv_audio],1) 1252 | 1253 | fv_dim = fv.get_shape().as_list()[1] 1254 | hidden1_weights = tf.get_variable("hidden1_weights", 1255 | [fv_dim, hidden1_size], 1256 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) 1257 | 1258 | activation = tf.matmul(fv, hidden1_weights) 1259 | 1260 | if add_batch_norm and relu: 1261 | activation = slim.batch_norm( 1262 | activation, 1263 | center=True, 1264 | scale=True, 1265 | is_training=is_training, 1266 | scope="hidden1_bn") 1267 | else: 1268 | hidden1_biases = tf.get_variable("hidden1_biases", 1269 | [hidden1_size], 1270 | initializer = tf.random_normal_initializer(stddev=0.01)) 1271 | tf.summary.histogram("hidden1_biases", hidden1_biases) 1272 | activation += hidden1_biases 1273 | 1274 | if relu: 1275 | activation = tf.nn.relu6(activation) 1276 | 1277 | if gating: 1278 | gating_weights = tf.get_variable("gating_weights_2", 1279 | [hidden1_size, hidden1_size], 1280 | initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size))) 1281 | 1282 | gates = tf.matmul(activation, gating_weights) 1283 | 1284 | if add_batch_norm: 1285 | gates = slim.batch_norm( 1286 | gates, 1287 | center=True, 1288 | scale=True, 1289 | is_training=is_training, 1290 | scope="gating_bn") 1291 | else: 1292 | gating_biases = tf.get_variable("gating_biases", 1293 | [cluster_size], 1294 | initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size))) 1295 | gates += gating_biases 1296 | 1297 | gates = tf.sigmoid(gates) 1298 | 1299 | activation = tf.multiply(activation,gates) 1300 | 1301 | 1302 | aggregated_model = getattr(video_level_models, 1303 | FLAGS.video_level_classifier_model) 1304 | 1305 | return 
aggregated_model().create_model( 1306 | model_input=activation, 1307 | vocab_size=vocab_size, 1308 | is_training=is_training, 1309 | **unused_params) 1310 | 1311 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Binary for generating predictions over a set of videos.""" 16 | 17 | import os 18 | import time 19 | 20 | import numpy 21 | import tensorflow as tf 22 | 23 | from tensorflow import app 24 | from tensorflow import flags 25 | from tensorflow import gfile 26 | from tensorflow import logging 27 | 28 | import eval_util 29 | import losses 30 | import readers 31 | import utils 32 | 33 | FLAGS = flags.FLAGS 34 | 35 | if __name__ == '__main__': 36 | flags.DEFINE_string("train_dir", "/tmp/yt8m_model/", 37 | "The directory to load the model files from.") 38 | flags.DEFINE_string("output_file", "", 39 | "The file to save the predictions to.") 40 | flags.DEFINE_string( 41 | "input_data_pattern", "", 42 | "File glob defining the evaluation dataset in tensorflow.SequenceExample " 43 | "format. The SequenceExamples are expected to have an 'rgb' byte array " 44 | "sequence feature as well as a 'labels' int64 context feature.") 45 | 46 | # Model flags. 47 | flags.DEFINE_bool( 48 | "frame_features", False, 49 | "If set, then --eval_data_pattern must be frame-level features. " 50 | "Otherwise, --eval_data_pattern must be aggregated video-level " 51 | "features. The model must also be set appropriately (i.e. to read 3D " 52 | "batches VS 4D batches.") 53 | flags.DEFINE_integer( 54 | "batch_size", 8192, 55 | "How many examples to process per batch.") 56 | flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature " 57 | "to use for training.") 58 | flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.") 59 | 60 | 61 | # Other flags. 62 | flags.DEFINE_integer("num_readers", 4, 63 | "How many threads to use for reading input files.") 64 | flags.DEFINE_integer("top_k", 20, 65 | "How many predictions to output per video.") 66 | flags.DEFINE_integer("check_point",-1, 67 | "Model checkpoint to load, -1 for latest.") 68 | 69 | def format_lines(video_ids, predictions, top_k): 70 | batch_size = len(video_ids) 71 | for video_index in range(batch_size): 72 | top_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:] 73 | line = [(class_index, predictions[video_index][class_index]) 74 | for class_index in top_indices] 75 | line = sorted(line, key=lambda p: -p[1]) 76 | yield video_ids[video_index].decode('utf-8') + "," + " ".join("%i %f" % pair 77 | for pair in line) + "\n" 78 | 79 | 80 | def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1): 81 | """Creates the section of the graph which reads the input data. 
82 | 83 | Args: 84 | reader: A class which parses the input data. 85 | data_pattern: A 'glob' style path to the data files. 86 | batch_size: How many examples to process at a time. 87 | num_readers: How many I/O threads to use. 88 | 89 | Returns: 90 | A tuple containing the features tensor, labels tensor, and optionally a 91 | tensor containing the number of frames per video. The exact dimensions 92 | depend on the reader being used. 93 | 94 | Raises: 95 | IOError: If no files matching the given pattern were found. 96 | """ 97 | with tf.name_scope("input"): 98 | files = gfile.Glob(data_pattern) 99 | if not files: 100 | raise IOError("Unable to find input files. data_pattern='" + 101 | data_pattern + "'") 102 | logging.info("number of input files: " + str(len(files))) 103 | filename_queue = tf.train.string_input_producer( 104 | files, num_epochs=1, shuffle=False) 105 | examples_and_labels = [reader.prepare_reader(filename_queue) 106 | for _ in range(num_readers)] 107 | 108 | video_id_batch, video_batch, unused_labels, num_frames_batch = ( 109 | tf.train.batch_join(examples_and_labels, 110 | batch_size=batch_size, 111 | allow_smaller_final_batch = True, 112 | enqueue_many=True)) 113 | return video_id_batch, video_batch, num_frames_batch 114 | 115 | def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k): 116 | with tf.Session() as sess, gfile.Open(out_file_location, "w+") as out_file: 117 | video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size) 118 | latest_checkpoint = tf.train.latest_checkpoint(train_dir) 119 | if latest_checkpoint is None: 120 | raise Exception("unable to find a checkpoint at location: %s" % train_dir) 121 | else: 122 | if FLAGS.check_point < 0: 123 | meta_graph_location = latest_checkpoint + ".meta" 124 | else: 125 | meta_graph_location = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) + ".meta" 126 | latest_checkpoint = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) 127 | logging.info("loading meta-graph: " + meta_graph_location) 128 | saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True) 129 | logging.info("restoring variables from " + latest_checkpoint) 130 | saver.restore(sess, latest_checkpoint) 131 | input_tensor = tf.get_collection("input_batch_raw")[0] 132 | num_frames_tensor = tf.get_collection("num_frames")[0] 133 | predictions_tensor = tf.get_collection("predictions")[0] 134 | 135 | # Workaround for num_epochs issue. 
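    # (Note: `string_input_producer(..., num_epochs=1)` creates a local
    # "epochs" counter variable, and the restored meta-graph also carries the
    # counters that belonged to the *training* input pipeline, under the
    # "train_input" scope. Those are never fed here, so they must be handled
    # before the queue runners start. An illustrative way to see which
    # variables are involved, assuming a restored graph as above:
    #
    #   for v in tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES):
    #     if "train_input" in v.name:
    #       print(v.name)  # epoch-limit counters from the training graph
    #
    # The helper below assigns those counters and initializes the rest.)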
136 |     def set_up_init_ops(variables):
137 |       init_op_list = []
138 |       for variable in list(variables):
139 |         if "train_input" in variable.name:
140 |           init_op_list.append(tf.assign(variable, 1))
141 |           variables.remove(variable)
142 |       init_op_list.append(tf.variables_initializer(variables))
143 |       return init_op_list
144 |
145 |     sess.run(set_up_init_ops(tf.get_collection_ref(
146 |         tf.GraphKeys.LOCAL_VARIABLES)))
147 |
148 |     coord = tf.train.Coordinator()
149 |     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
150 |     num_examples_processed = 0
151 |     start_time = time.time()
152 |     out_file.write("VideoId,LabelConfidencePairs\n")
153 |
154 |     try:
155 |       while not coord.should_stop():
156 |         video_id_batch_val, video_batch_val, num_frames_batch_val = sess.run([video_id_batch, video_batch, num_frames_batch])
157 |         predictions_val, = sess.run([predictions_tensor], feed_dict={input_tensor: video_batch_val, num_frames_tensor: num_frames_batch_val})
158 |         now = time.time()
159 |         num_examples_processed += len(video_batch_val)
160 |         num_classes = predictions_val.shape[1]
161 |         logging.info("num examples processed: " + str(num_examples_processed) + " elapsed seconds: " + "{0:.2f}".format(now - start_time))
162 |         for line in format_lines(video_id_batch_val, predictions_val, top_k):
163 |           out_file.write(line)
164 |         out_file.flush()
165 |
166 |
167 |     except tf.errors.OutOfRangeError:
168 |       logging.info('Done with inference. The output file was written to ' + out_file_location)
169 |     finally:
170 |       coord.request_stop()
171 |
172 |     coord.join(threads)
173 |     sess.close()
174 |
175 |
176 | def main(unused_argv):
177 |   logging.set_verbosity(tf.logging.INFO)
178 |
179 |   # convert feature_names and feature_sizes to lists of values
180 |   feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
181 |       FLAGS.feature_names, FLAGS.feature_sizes)
182 |
183 |   if FLAGS.frame_features:
184 |     reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
185 |                                             feature_sizes=feature_sizes)
186 |   else:
187 |     reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
188 |                                                  feature_sizes=feature_sizes)
189 |
190 |   if FLAGS.output_file == "":
191 |     raise ValueError("'output_file' was not specified. "
192 |                      "Unable to continue with inference.")
193 |
194 |   if FLAGS.input_data_pattern == "":
195 |     raise ValueError("'input_data_pattern' was not specified. "
196 |                      "Unable to continue with inference.")
197 |
198 |   inference(reader, FLAGS.train_dir, FLAGS.input_data_pattern,
199 |             FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k)
200 |
201 |
202 | if __name__ == "__main__":
203 |   app.run()
204 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Antoine Miech All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
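# (Note: the CrossEntropyLoss below is a class-weighted binary cross entropy.
# Weight alpha applies to the positive term and (1 - alpha) to the negative
# term, scaled by 2 so that alpha = 0.5 recovers the standard loss:
#
#     L = -2 * (alpha * y * log(p + eps) + (1 - alpha) * (1 - y) * log(1 - p + eps))
#
# An illustrative numpy sketch of the same computation, not part of the
# original file:
#
#   import numpy as np
#
#   def weighted_xent(p, y, alpha=0.5, eps=1e-5):
#     per_label = -2 * (alpha * y * np.log(p + eps)
#                       + (1 - alpha) * (1 - y) * np.log(1 - p + eps))
#     return per_label.sum(axis=1).mean()  # sum over classes, mean over batch
#
# The TF implementation below follows the same reduce_sum / reduce_mean order.)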
14 |
15 | """Provides definitions for non-regularized training or test losses."""
16 |
17 | import tensorflow as tf
18 | from tensorflow import flags
19 | import scipy.io as sio
20 | import numpy as np
21 |
22 | FLAGS = flags.FLAGS
23 |
24 | flags.DEFINE_float(
25 |     "alpha", 0.5,
26 |     "Weight on the positive-label term of the weighted cross entropy loss.")
27 |
28 |
29 | class BaseLoss(object):
30 |   """Inherit from this class when implementing new losses."""
31 |
32 |   def calculate_loss(self, unused_predictions, unused_labels, **unused_params):
33 |     """Calculates the average loss of the examples in a mini-batch.
34 |
35 |     Args:
36 |       unused_predictions: a 2-d tensor storing the prediction scores, in which
37 |         each row represents a sample in the mini-batch and each column
38 |         represents a class.
39 |       unused_labels: a 2-d tensor storing the labels, which has the same shape
40 |         as the unused_predictions. The labels must be in the range of 0 and 1.
41 |       unused_params: loss specific parameters.
42 |
43 |     Returns:
44 |       A scalar loss tensor.
45 |     """
46 |     raise NotImplementedError()
47 |
48 |
49 | class CrossEntropyLoss(BaseLoss):
50 |   """Calculate the cross entropy loss between the predictions and labels.
51 |   """
52 |
53 |   def calculate_loss(self, predictions, labels, **unused_params):
54 |     with tf.name_scope("loss_xent"):
55 |       epsilon = 10e-6
56 |       alpha = FLAGS.alpha
57 |
58 |       float_labels = tf.cast(labels, tf.float32)
59 |       cross_entropy_loss = 2 * (alpha * float_labels * tf.log(predictions + epsilon) + (1 - alpha) * (
60 |           1 - float_labels) * tf.log(1 - predictions + epsilon))
61 |       cross_entropy_loss = tf.negative(cross_entropy_loss)
62 |       return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
63 |
--------------------------------------------------------------------------------
/mean_average_precision_calculator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Calculate the mean average precision.
16 |
17 | It provides an interface for calculating mean average precision
18 | for an entire list or the top-n ranked items.
19 |
20 | Example usage:
21 | Call accumulate() repeatedly to process parts of the ranked list. After
22 | processing all the parts, call peek_map_at_n() to compute the mean
23 | average precision.
24 |
25 | ```
26 | import random
27 |
28 | p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])
29 | a = np.array([[random.choice([0, 1]) for _ in xrange(50)]
30 |               for _ in xrange(1000)])
31 |
32 | # mean average precision for 50 classes.
33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
34 |     num_class=50)
35 | calculator.accumulate(p, a)
36 | aps = calculator.peek_map_at_n()
37 | ```
38 | """
39 |
40 | import numpy
41 | import average_precision_calculator
42 |
43 |
44 | class MeanAveragePrecisionCalculator(object):
45 |   """This class computes the mean average precision across classes.
46 |   """
47 |
48 |   def __init__(self, num_class):
49 |     """Construct a calculator to calculate the (macro) average precision.
50 |
51 |     Args:
52 |       num_class: A positive integer, larger than 1, specifying the number of
53 |         classes. One AveragePrecisionCalculator is kept per class.
54 |
55 |     Raises:
56 |       ValueError: An error occurred when num_class is not an integer larger
57 |         than 1.
58 |     """
59 |     if not isinstance(num_class, int) or num_class <= 1:
60 |       raise ValueError("num_class must be an integer larger than 1.")
61 |
62 |     self._ap_calculators = []  # one AveragePrecisionCalculator per class
63 |     self._num_class = num_class  # total number of classes
64 |     for i in range(num_class):
65 |       self._ap_calculators.append(
66 |           average_precision_calculator.AveragePrecisionCalculator())
67 |
68 |   def accumulate(self, predictions, actuals, num_positives=None):
69 |     """Accumulate the predictions and their ground truth labels.
70 |
71 |     Args:
72 |       predictions: A list of lists storing the prediction scores. The outer
73 |         dimension corresponds to classes.
74 |       actuals: A list of lists storing the ground truth labels. The dimensions
75 |         should correspond to the predictions input. Any value
76 |         larger than 0 will be treated as positives, otherwise as negatives.
77 |       num_positives: If provided, it is a list of numbers representing the
78 |         number of true positives for each class. If not provided, the number of
79 |         true positives will be inferred from the 'actuals' array.
80 |
81 |     Raises:
82 |       ValueError: An error occurred when the shape of predictions and actuals
83 |         does not match.
84 |     """
85 |     if not num_positives:
86 |       # One placeholder count per class; iterating over an integer shape
87 |       # value directly would raise a TypeError.
88 |       num_positives = [None] * self._num_class
89 |
90 |     calculators = self._ap_calculators
91 |     for i in range(len(predictions)):
92 |       calculators[i].accumulate(predictions[i], actuals[i], num_positives[i])
93 |
94 |   def clear(self):
95 |     for calculator in self._ap_calculators:
96 |       calculator.clear()
97 |
98 |   def is_empty(self):
99 |     return ([calculator.heap_size for calculator in self._ap_calculators] ==
100 |             [0 for _ in range(self._num_class)])
101 |
102 |   def peek_map_at_n(self):
103 |     """Peek the non-interpolated mean average precision at n.
104 |
105 |     Returns:
106 |       An array of non-interpolated average precision at n (default 0) for each
107 |       class.
108 |     """
109 |     aps = [self._ap_calculators[i].peek_ap_at_n()
110 |            for i in range(self._num_class)]
111 |     return aps
112 |
--------------------------------------------------------------------------------
/model_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Contains a collection of util functions for model construction.
16 | """
17 | import numpy
18 | import tensorflow as tf
19 | from tensorflow import logging
20 | from tensorflow import flags
21 | import tensorflow.contrib.slim as slim
22 |
23 | def SampleRandomSequence(model_input, num_frames, num_samples):
24 |   """Samples a random sequence of frames of size num_samples.
25 |
26 |   Args:
27 |     model_input: A tensor of size batch_size x max_frames x feature_size
28 |     num_frames: A tensor of size batch_size x 1
29 |     num_samples: A scalar
30 |
31 |   Returns:
32 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
33 |   """
34 |
35 |   batch_size = tf.shape(model_input)[0]
36 |   frame_index_offset = tf.tile(
37 |       tf.expand_dims(tf.range(num_samples), 0), [batch_size, 1])
38 |   max_start_frame_index = tf.maximum(num_frames - num_samples, 0)
39 |   start_frame_index = tf.cast(
40 |       tf.multiply(
41 |           tf.random_uniform([batch_size, 1]),
42 |           tf.cast(max_start_frame_index + 1, tf.float32)), tf.int32)
43 |   frame_index = tf.minimum(start_frame_index + frame_index_offset,
44 |                            tf.cast(num_frames - 1, tf.int32))
45 |   batch_index = tf.tile(
46 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
47 |   index = tf.stack([batch_index, frame_index], 2)
48 |   return tf.gather_nd(model_input, index)
49 |
50 |
51 | def SampleRandomFrames(model_input, num_frames, num_samples):
52 |   """Samples a random set of frames of size num_samples.
53 |
54 |   Args:
55 |     model_input: A tensor of size batch_size x max_frames x feature_size
56 |     num_frames: A tensor of size batch_size x 1
57 |     num_samples: A scalar
58 |
59 |   Returns:
60 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
61 |   """
62 |   batch_size = tf.shape(model_input)[0]
63 |   frame_index = tf.cast(
64 |       tf.multiply(
65 |           tf.random_uniform([batch_size, num_samples]),
66 |           tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])), tf.int32)
67 |   batch_index = tf.tile(
68 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
69 |   index = tf.stack([batch_index, frame_index], 2)
70 |   return tf.gather_nd(model_input, index)
71 |
72 | def FramePooling(frames, method, **unused_params):
73 |   """Pools over the frames of a video.
74 |
75 |   Args:
76 |     frames: A tensor with shape [batch_size, num_frames, feature_size].
77 |     method: "average", "max", or "none" (no "attention" branch is implemented).
78 |   Returns:
79 |     A tensor with shape [batch_size, feature_size] for average or max
80 |     pooling. A tensor with shape [batch_size*num_frames, feature_size]
81 |     for none pooling.
82 |
83 |   Raises:
84 |     ValueError: if method is other than "average", "max", or
85 |       "none".
86 |   """
87 |   if method == "average":
88 |     return tf.reduce_mean(frames, 1)
89 |   elif method == "max":
90 |     return tf.reduce_max(frames, 1)
91 |   elif method == "none":
92 |     feature_size = frames.get_shape().as_list()[2]
93 |     return tf.reshape(frames, [-1, feature_size])
94 |   else:
95 |     raise ValueError("Unrecognized pooling method: %s" % method)
96 |
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Contains the base class for models."""
16 |
17 | class BaseModel(object):
18 |   """Inherit from this class when implementing new models."""
19 |
20 |   def create_model(self, unused_model_input, **unused_params):
21 |     raise NotImplementedError()
22 |
--------------------------------------------------------------------------------
/readers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Provides readers configured for different datasets."""
16 |
17 | import tensorflow as tf
18 | import utils
19 |
20 | from tensorflow import logging
21 | def resize_axis(tensor, axis, new_size, fill_value=0):
22 |   """Truncates or pads a tensor to new_size on a given axis.
23 |
24 |   Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
25 |   size increases, the padding will be performed at the end, using fill_value.
26 |
27 |   Args:
28 |     tensor: The tensor to be resized.
29 |     axis: An integer representing the dimension to be sliced.
30 |     new_size: An integer or 0d tensor representing the new value for
31 |       tensor.shape[axis].
32 |     fill_value: Value to use to fill any new entries in the tensor. Will be
33 |       cast to the type of tensor.
34 |
35 |   Returns:
36 |     The resized tensor.
37 |   """
38 |   tensor = tf.convert_to_tensor(tensor)
39 |   shape = tf.unstack(tf.shape(tensor))
40 |
41 |   pad_shape = shape[:]
42 |   pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
43 |
44 |   shape[axis] = tf.minimum(shape[axis], new_size)
45 |   shape = tf.stack(shape)
46 |
47 |   resized = tf.concat([
48 |       tf.slice(tensor, tf.zeros_like(shape), shape),
49 |       tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
50 |   ], axis)
51 |
52 |   # Update shape.
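  # (Illustrative usage sketch, not part of the original file: resize_axis is
  # the pad-or-truncate primitive the frame-level reader uses to force every
  # video to exactly max_frames rows, e.g.
  #
  #   frames = tf.placeholder(tf.float32, [None, 1024])  # variable-length video
  #   fixed = resize_axis(frames, axis=0, new_size=300)  # 300 x 1024, zero-padded
  #
  # The set_shape call below then records the now-static size on the result.)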
53 | new_shape = tensor.get_shape().as_list() # A copy is being made. 54 | new_shape[axis] = new_size 55 | resized.set_shape(new_shape) 56 | return resized 57 | 58 | class BaseReader(object): 59 | """Inherit from this class when implementing new readers.""" 60 | 61 | def prepare_reader(self, unused_filename_queue): 62 | """Create a thread for generating prediction and label tensors.""" 63 | raise NotImplementedError() 64 | 65 | 66 | class YT8MAggregatedFeatureReader(BaseReader): 67 | """Reads TFRecords of pre-aggregated Examples. 68 | 69 | The TFRecords must contain Examples with a sparse int64 'labels' feature and 70 | a fixed length float32 feature, obtained from the features in 'feature_name'. 71 | The float features are assumed to be an average of dequantized values. 72 | """ 73 | 74 | def __init__(self, 75 | num_classes=4716, 76 | feature_sizes=[1024], 77 | feature_names=["mean_inc3"]): 78 | """Construct a YT8MAggregatedFeatureReader. 79 | 80 | Args: 81 | num_classes: a positive integer for the number of classes. 82 | feature_sizes: positive integer(s) for the feature dimensions as a list. 83 | feature_names: the feature name(s) in the tensorflow record as a list. 84 | """ 85 | 86 | assert len(feature_names) == len(feature_sizes), \ 87 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 88 | len(feature_names), len(feature_sizes)) 89 | 90 | self.num_classes = num_classes 91 | self.feature_sizes = feature_sizes 92 | self.feature_names = feature_names 93 | 94 | def prepare_reader(self, filename_queue, batch_size=1024): 95 | """Creates a single reader thread for pre-aggregated YouTube 8M Examples. 96 | 97 | Args: 98 | filename_queue: A tensorflow queue of filename locations. 99 | 100 | Returns: 101 | A tuple of video indexes, features, labels, and padding data. 102 | """ 103 | reader = tf.TFRecordReader() 104 | _, serialized_examples = reader.read_up_to(filename_queue, batch_size) 105 | 106 | tf.add_to_collection("serialized_examples", serialized_examples) 107 | return self.prepare_serialized_examples(serialized_examples) 108 | 109 | def prepare_serialized_examples(self, serialized_examples): 110 | # set the mapping from the fields to data types in the proto 111 | num_features = len(self.feature_names) 112 | assert num_features > 0, "self.feature_names is empty!" 113 | assert len(self.feature_names) == len(self.feature_sizes), \ 114 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 115 | len(self.feature_names), len(self.feature_sizes)) 116 | 117 | feature_map = {"video_id": tf.FixedLenFeature([], tf.string), 118 | "labels": tf.VarLenFeature(tf.int64)} 119 | for feature_index in range(num_features): 120 | feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature( 121 | [self.feature_sizes[feature_index]], tf.float32) 122 | 123 | features = tf.parse_example(serialized_examples, features=feature_map) 124 | 125 | labels = tf.sparse_to_indicator(features["labels"], self.num_classes) 126 | labels.set_shape([None, self.num_classes]) 127 | concatenated_features = tf.concat([ 128 | features[feature_name] for feature_name in self.feature_names], 1) 129 | 130 | return features["video_id"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]]) 131 | 132 | class YT8MFrameFeatureReader(BaseReader): 133 | """Reads TFRecords of SequenceExamples. 
134 | 135 | The TFRecords must contain SequenceExamples with the sparse in64 'labels' 136 | context feature and a fixed length byte-quantized feature vector, obtained 137 | from the features in 'feature_names'. The quantized features will be mapped 138 | back into a range between min_quantized_value and max_quantized_value. 139 | """ 140 | 141 | def __init__(self, 142 | num_classes=4716, 143 | feature_sizes=[1024], 144 | feature_names=["inc3"], 145 | max_frames=300): 146 | """Construct a YT8MFrameFeatureReader. 147 | 148 | Args: 149 | num_classes: a positive integer for the number of classes. 150 | feature_sizes: positive integer(s) for the feature dimensions as a list. 151 | feature_names: the feature name(s) in the tensorflow record as a list. 152 | max_frames: the maximum number of frames to process. 153 | """ 154 | 155 | assert len(feature_names) == len(feature_sizes), \ 156 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 157 | len(feature_names), len(feature_sizes)) 158 | 159 | self.num_classes = num_classes 160 | self.feature_sizes = feature_sizes 161 | self.feature_names = feature_names 162 | self.max_frames = max_frames 163 | 164 | def get_video_matrix(self, 165 | features, 166 | feature_size, 167 | max_frames, 168 | max_quantized_value, 169 | min_quantized_value): 170 | """Decodes features from an input string and quantizes it. 171 | 172 | Args: 173 | features: raw feature values 174 | feature_size: length of each frame feature vector 175 | max_frames: number of frames (rows) in the output feature_matrix 176 | max_quantized_value: the maximum of the quantized value. 177 | min_quantized_value: the minimum of the quantized value. 178 | 179 | Returns: 180 | feature_matrix: matrix of all frame-features 181 | num_frames: number of frames in the sequence 182 | """ 183 | decoded_features = tf.reshape( 184 | tf.cast(tf.decode_raw(features, tf.uint8), tf.float32), 185 | [-1, feature_size]) 186 | 187 | num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames) 188 | feature_matrix = utils.Dequantize(decoded_features, 189 | max_quantized_value, 190 | min_quantized_value) 191 | feature_matrix = resize_axis(feature_matrix, 0, max_frames) 192 | return feature_matrix, num_frames 193 | 194 | def prepare_reader(self, 195 | filename_queue, 196 | max_quantized_value=2, 197 | min_quantized_value=-2): 198 | """Creates a single reader thread for YouTube8M SequenceExamples. 199 | 200 | Args: 201 | filename_queue: A tensorflow queue of filename locations. 202 | max_quantized_value: the maximum of the quantized value. 203 | min_quantized_value: the minimum of the quantized value. 204 | 205 | Returns: 206 | A tuple of video indexes, video features, labels, and padding data. 
207 | """ 208 | reader = tf.TFRecordReader() 209 | _, serialized_example = reader.read(filename_queue) 210 | 211 | return self.prepare_serialized_examples(serialized_example, 212 | max_quantized_value, min_quantized_value) 213 | 214 | def prepare_serialized_examples(self, serialized_example, 215 | max_quantized_value=2, min_quantized_value=-2): 216 | 217 | contexts, features = tf.parse_single_sequence_example( 218 | serialized_example, 219 | context_features={"video_id": tf.FixedLenFeature( 220 | [], tf.string), 221 | "labels": tf.VarLenFeature(tf.int64)}, 222 | sequence_features={ 223 | feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string) 224 | for feature_name in self.feature_names 225 | }) 226 | 227 | # read ground truth labels 228 | labels = (tf.cast( 229 | tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1, 230 | validate_indices=False), 231 | tf.bool)) 232 | 233 | # loads (potentially) different types of features and concatenates them 234 | num_features = len(self.feature_names) 235 | assert num_features > 0, "No feature selected: feature_names is empty!" 236 | 237 | assert len(self.feature_names) == len(self.feature_sizes), \ 238 | "length of feature_names (={}) != length of feature_sizes (={})".format( \ 239 | len(self.feature_names), len(self.feature_sizes)) 240 | 241 | num_frames = -1 # the number of frames in the video 242 | feature_matrices = [None] * num_features # an array of different features 243 | for feature_index in range(num_features): 244 | feature_matrix, num_frames_in_this_feature = self.get_video_matrix( 245 | features[self.feature_names[feature_index]], 246 | self.feature_sizes[feature_index], 247 | self.max_frames, 248 | max_quantized_value, 249 | min_quantized_value) 250 | if num_frames == -1: 251 | num_frames = num_frames_in_this_feature 252 | else: 253 | tf.assert_equal(num_frames, num_frames_in_this_feature) 254 | 255 | feature_matrices[feature_index] = feature_matrix 256 | 257 | # cap the number of frames at self.max_frames 258 | num_frames = tf.minimum(num_frames, self.max_frames) 259 | 260 | # concatenate different features 261 | video_matrix = tf.concat(feature_matrices, 1) 262 | 263 | # convert to batch format. 264 | # TODO: Do proper batch reads to remove the IO bottleneck. 265 | batch_video_ids = tf.expand_dims(contexts["video_id"], 0) 266 | batch_video_matrix = tf.expand_dims(video_matrix, 0) 267 | batch_labels = tf.expand_dims(labels, 0) 268 | batch_frames = tf.expand_dims(num_frames, 0) 269 | 270 | return batch_video_ids, batch_video_matrix, batch_labels, batch_frames 271 | 272 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
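# (Illustrative invocation, not part of the original file; the data path and
# model name are placeholders, while the flag names are the ones defined
# below:
#
#   python train.py \
#     --train_data_pattern="gs://some_bucket/train*.tfrecord" \
#     --frame_features=True \
#     --feature_names="rgb,audio" --feature_sizes="1024,128" \
#     --model=SomeFrameLevelModel \
#     --train_dir=/tmp/yt8m_model \
#     --batch_size=128 --base_learning_rate=0.0002
# )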
14 | """Binary for training Tensorflow models on the YouTube-8M dataset.""" 15 | 16 | import json 17 | import os 18 | import time 19 | 20 | import eval_util 21 | import export_model 22 | import losses 23 | import frame_level_models 24 | import video_level_models 25 | import readers 26 | import tensorflow as tf 27 | import tensorflow.contrib.slim as slim 28 | from tensorflow import app 29 | from tensorflow import flags 30 | from tensorflow import gfile 31 | from tensorflow import logging 32 | import utils 33 | 34 | FLAGS = flags.FLAGS 35 | 36 | if __name__ == "__main__": 37 | # Dataset flags. 38 | flags.DEFINE_string("train_dir", "/tmp/yt8m_model/", 39 | "The directory to save the model files in.") 40 | flags.DEFINE_string( 41 | "train_data_pattern", "", 42 | "File glob for the training dataset. If the files refer to Frame Level " 43 | "features (i.e. tensorflow.SequenceExample), then set --reader_type " 44 | "format. The (Sequence)Examples are expected to have 'rgb' byte array " 45 | "sequence feature as well as a 'labels' int64 context feature.") 46 | flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature " 47 | "to use for training.") 48 | flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.") 49 | 50 | # Model flags. 51 | flags.DEFINE_bool( 52 | "frame_features", False, 53 | "If set, then --train_data_pattern must be frame-level features. " 54 | "Otherwise, --train_data_pattern must be aggregated video-level " 55 | "features. The model must also be set appropriately (i.e. to read 3D " 56 | "batches VS 4D batches.") 57 | flags.DEFINE_string( 58 | "model", "LogisticModel", 59 | "Which architecture to use for the model. Models are defined " 60 | "in models.py.") 61 | flags.DEFINE_bool( 62 | "start_new_model", False, 63 | "If set, this will not resume from a checkpoint and will instead create a" 64 | " new model instance.") 65 | 66 | # Training flags. 67 | flags.DEFINE_integer("batch_size", 1024, 68 | "How many examples to process per batch for training.") 69 | flags.DEFINE_string("label_loss", "CrossEntropyLoss", 70 | "Which loss function to use for training the model.") 71 | flags.DEFINE_float( 72 | "regularization_penalty", 1, 73 | "How much weight to give to the regularization loss (the label loss has " 74 | "a weight of 1).") 75 | flags.DEFINE_float("base_learning_rate", 0.001, 76 | "Which learning rate to start with.") 77 | flags.DEFINE_float("learning_rate_decay", 0.9, 78 | "Learning rate decay factor to be applied every " 79 | "learning_rate_decay_examples.") 80 | flags.DEFINE_float("learning_rate_decay_examples", 4000000, 81 | "Multiply current learning rate by learning_rate_decay " 82 | "every learning_rate_decay_examples.") 83 | flags.DEFINE_integer("num_epochs", 15, 84 | "How many passes to make over the dataset before " 85 | "halting training.") 86 | flags.DEFINE_integer("max_steps", None, 87 | "The maximum number of iterations of the training loop.") 88 | flags.DEFINE_integer("export_model_steps", 10000, 89 | "The period, in number of steps, with which the model " 90 | "is exported for batch prediction.") 91 | 92 | # Other flags. 
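# (Note on the learning-rate flags above: build_graph uses staircase
# exponential decay, so after s training steps the learning rate is
#
#   lr(s) = base_learning_rate * learning_rate_decay ** floor(s * batch_size / learning_rate_decay_examples)
#
# With the defaults (0.001 base, 0.9 decay, 4,000,000 examples, batch 1024),
# the first drop to 0.0009 happens after 4000000 / 1024, i.e. about
# 3907 steps.)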
93 | flags.DEFINE_integer("num_readers", 8, 94 | "How many threads to use for reading input files.") 95 | flags.DEFINE_string("optimizer", "AdamOptimizer", 96 | "What optimizer class to use.") 97 | flags.DEFINE_float("clip_gradient_norm", 1.0, "Norm to clip gradients to.") 98 | flags.DEFINE_bool( 99 | "log_device_placement", False, 100 | "Whether to write the device on which every op will run into the " 101 | "logs on startup.") 102 | 103 | def validate_class_name(flag_value, category, modules, expected_superclass): 104 | """Checks that the given string matches a class of the expected type. 105 | 106 | Args: 107 | flag_value: A string naming the class to instantiate. 108 | category: A string used further describe the class in error messages 109 | (e.g. 'model', 'reader', 'loss'). 110 | modules: A list of modules to search for the given class. 111 | expected_superclass: A class that the given class should inherit from. 112 | 113 | Raises: 114 | FlagsError: If the given class could not be found or if the first class 115 | found with that name doesn't inherit from the expected superclass. 116 | 117 | Returns: 118 | True if a class was found that matches the given constraints. 119 | """ 120 | candidates = [getattr(module, flag_value, None) for module in modules] 121 | for candidate in candidates: 122 | if not candidate: 123 | continue 124 | if not issubclass(candidate, expected_superclass): 125 | raise flags.FlagsError("%s '%s' doesn't inherit from %s." % 126 | (category, flag_value, 127 | expected_superclass.__name__)) 128 | return True 129 | raise flags.FlagsError("Unable to find %s '%s'." % (category, flag_value)) 130 | 131 | def get_input_data_tensors(reader, 132 | data_pattern, 133 | batch_size=1000, 134 | num_epochs=None, 135 | num_readers=1): 136 | """Creates the section of the graph which reads the training data. 137 | 138 | Args: 139 | reader: A class which parses the training data. 140 | data_pattern: A 'glob' style path to the data files. 141 | batch_size: How many examples to process at a time. 142 | num_epochs: How many passes to make over the training data. Set to 'None' 143 | to run indefinitely. 144 | num_readers: How many I/O threads to use. 145 | 146 | Returns: 147 | A tuple containing the features tensor, labels tensor, and optionally a 148 | tensor containing the number of frames per video. The exact dimensions 149 | depend on the reader being used. 150 | 151 | Raises: 152 | IOError: If no files matching the given pattern were found. 153 | """ 154 | logging.info("Using batch size of " + str(batch_size) + " for training.") 155 | with tf.name_scope("train_input"): 156 | files = gfile.Glob(data_pattern) 157 | if not files: 158 | raise IOError("Unable to find training files. 
data_pattern='" + 159 | data_pattern + "'.") 160 | logging.info("Number of training files: %s.", str(len(files))) 161 | filename_queue = tf.train.string_input_producer( 162 | files, num_epochs=num_epochs, shuffle=True) 163 | training_data = [ 164 | reader.prepare_reader(filename_queue) for _ in range(num_readers) 165 | ] 166 | 167 | return tf.train.shuffle_batch_join( 168 | training_data, 169 | batch_size=batch_size, 170 | capacity=FLAGS.batch_size * 5, 171 | min_after_dequeue=FLAGS.batch_size, 172 | allow_smaller_final_batch=True, 173 | enqueue_many=True) 174 | 175 | 176 | def find_class_by_name(name, modules): 177 | """Searches the provided modules for the named class and returns it.""" 178 | modules = [getattr(module, name, None) for module in modules] 179 | return next(a for a in modules if a) 180 | 181 | 182 | def build_graph(reader, 183 | model, 184 | train_data_pattern, 185 | label_loss_fn=losses.CrossEntropyLoss(), 186 | batch_size=1000, 187 | base_learning_rate=0.01, 188 | learning_rate_decay_examples=1000000, 189 | learning_rate_decay=0.95, 190 | optimizer_class=tf.train.AdamOptimizer, 191 | clip_gradient_norm=1.0, 192 | regularization_penalty=1, 193 | num_readers=1, 194 | num_epochs=None): 195 | """Creates the Tensorflow graph. 196 | 197 | This will only be called once in the life of 198 | a training model, because after the graph is created the model will be 199 | restored from a meta graph file rather than being recreated. 200 | 201 | Args: 202 | reader: The data file reader. It should inherit from BaseReader. 203 | model: The core model (e.g. logistic or neural net). It should inherit 204 | from BaseModel. 205 | train_data_pattern: glob path to the training data files. 206 | label_loss_fn: What kind of loss to apply to the model. It should inherit 207 | from BaseLoss. 208 | batch_size: How many examples to process at a time. 209 | base_learning_rate: What learning rate to initialize the optimizer with. 210 | optimizer_class: Which optimization algorithm to use. 211 | clip_gradient_norm: Magnitude of the gradient to clip to. 212 | regularization_penalty: How much weight to give the regularization loss 213 | compared to the label loss. 214 | num_readers: How many threads to use for I/O operations. 215 | num_epochs: How many passes to make over the data. 'None' means an 216 | unlimited number of passes. 
217 | """ 218 | 219 | global_step = tf.Variable(0, trainable=False, name="global_step") 220 | 221 | learning_rate = tf.train.exponential_decay( 222 | base_learning_rate, 223 | global_step * batch_size, 224 | learning_rate_decay_examples, 225 | learning_rate_decay, 226 | staircase=True) 227 | tf.summary.scalar('learning_rate', learning_rate) 228 | 229 | optimizer = optimizer_class(learning_rate) 230 | unused_video_id, model_input_raw, labels_batch, num_frames = ( 231 | get_input_data_tensors( 232 | reader, 233 | train_data_pattern, 234 | batch_size=batch_size, 235 | num_readers=num_readers, 236 | num_epochs=num_epochs)) 237 | tf.summary.histogram("model/input_raw", model_input_raw) 238 | 239 | feature_dim = len(model_input_raw.get_shape()) - 1 240 | 241 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 242 | 243 | with tf.name_scope("model"): 244 | result = model.create_model( 245 | model_input, 246 | num_frames=num_frames, 247 | vocab_size=reader.num_classes, 248 | labels=labels_batch) 249 | 250 | for variable in slim.get_model_variables(): 251 | tf.summary.histogram(variable.op.name, variable) 252 | 253 | predictions = result["predictions"] 254 | if "loss" in result.keys(): 255 | label_loss = result["loss"] 256 | else: 257 | label_loss = label_loss_fn.calculate_loss(predictions, labels_batch) 258 | tf.summary.scalar("label_loss", label_loss) 259 | 260 | if "regularization_loss" in result.keys(): 261 | reg_loss = result["regularization_loss"] 262 | else: 263 | reg_loss = tf.constant(0.0) 264 | 265 | reg_losses = tf.losses.get_regularization_losses() 266 | if reg_losses: 267 | reg_loss += tf.add_n(reg_losses) 268 | 269 | if regularization_penalty != 0: 270 | tf.summary.scalar("reg_loss", reg_loss) 271 | 272 | # Adds update_ops (e.g., moving average updates in batch normalization) as 273 | # a dependency to the train_op. 274 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 275 | if "update_ops" in result.keys(): 276 | update_ops += result["update_ops"] 277 | if update_ops: 278 | with tf.control_dependencies(update_ops): 279 | barrier = tf.no_op(name="gradient_barrier") 280 | with tf.control_dependencies([barrier]): 281 | label_loss = tf.identity(label_loss) 282 | 283 | # Incorporate the L2 weight penalties etc. 284 | final_loss = regularization_penalty * reg_loss + label_loss 285 | train_op = slim.learning.create_train_op( 286 | final_loss, 287 | optimizer, 288 | global_step=global_step, 289 | clip_gradient_norm=clip_gradient_norm) 290 | 291 | tf.add_to_collection("global_step", global_step) 292 | tf.add_to_collection("loss", label_loss) 293 | tf.add_to_collection("predictions", predictions) 294 | tf.add_to_collection("input_batch_raw", model_input_raw) 295 | tf.add_to_collection("input_batch", model_input) 296 | tf.add_to_collection("num_frames", num_frames) 297 | tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) 298 | tf.add_to_collection("train_op", train_op) 299 | 300 | 301 | class Trainer(object): 302 | """A Trainer to train a Tensorflow graph.""" 303 | 304 | def __init__(self, cluster, task, train_dir, model, reader, model_exporter, 305 | log_device_placement=True, max_steps=None, 306 | export_model_steps=1000): 307 | """"Creates a Trainer. 308 | 309 | Args: 310 | cluster: A tf.train.ClusterSpec if the execution is distributed. 311 | None otherwise. 312 | task: A TaskSpec describing the job type and the task index. 
313 |     """
314 |
315 |     self.cluster = cluster
316 |     self.task = task
317 |     self.is_master = (task.type == "master" and task.index == 0)
318 |     self.train_dir = train_dir
319 |     self.config = tf.ConfigProto(log_device_placement=log_device_placement)
320 |     self.model = model
321 |     self.reader = reader
322 |     self.model_exporter = model_exporter
323 |     self.max_steps = max_steps
324 |     self.max_steps_reached = False
325 |     self.export_model_steps = export_model_steps
326 |     self.last_model_export_step = 0
327 |
328 |     if self.task.type == "master" and self.task.index > 0:
329 |       raise StandardError("%s: Only one replica of master expected" %
330 |                           task_as_string(self.task))
331 |
332 |   def run(self, start_new_model=False):
333 |     """Performs training on the currently defined Tensorflow graph.
334 |
335 |     Runs until the data is exhausted, `max_steps` is reached, or the
336 |     supervisor requests a stop; it does not return a value.
337 |     """
338 |     if self.is_master and start_new_model:
339 |       self.remove_training_directory(self.train_dir)
340 |
341 |     target, device_fn = self.start_server_if_distributed()
342 |
343 |     meta_filename = self.get_meta_filename(start_new_model, self.train_dir)
344 |
345 |     with tf.Graph().as_default() as graph:
346 |
347 |       if meta_filename:
348 |         saver = self.recover_model(meta_filename)
349 |
350 |       with tf.device(device_fn):
351 |
352 |         if not meta_filename:
353 |           saver = self.build_model(self.model, self.reader)
354 |
355 |         global_step = tf.get_collection("global_step")[0]
356 |         loss = tf.get_collection("loss")[0]
357 |         predictions = tf.get_collection("predictions")[0]
358 |         labels = tf.get_collection("labels")[0]
359 |         train_op = tf.get_collection("train_op")[0]
360 |         init_op = tf.global_variables_initializer()
361 |
362 |       sv = tf.train.Supervisor(
363 |           graph,
364 |           logdir=self.train_dir,
365 |           init_op=init_op,
366 |           is_chief=self.is_master,
367 |           global_step=global_step,
368 |           save_model_secs=15 * 60,
369 |           save_summaries_secs=120,
370 |           saver=saver)
371 |
372 |       logging.info("%s: Starting managed session.", task_as_string(self.task))
373 |       with sv.managed_session(target, config=self.config) as sess:
374 |
375 |         try:
376 |           logging.info("%s: Entering training loop.", task_as_string(self.task))
377 |           while (not sv.should_stop()) and (not self.max_steps_reached):
378 |
379 |             batch_start_time = time.time()
380 |             _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
381 |                 [train_op, global_step, loss, predictions, labels])
382 |             seconds_per_batch = time.time() - batch_start_time
383 |
384 |             if self.max_steps and self.max_steps <= global_step_val:
385 |               self.max_steps_reached = True
386 |
387 |             if self.is_master:
388 |               examples_per_second = labels_val.shape[0] / seconds_per_batch
389 |               hit_at_one = eval_util.calculate_hit_at_one(predictions_val,
390 |                                                           labels_val)
391 |               perr = eval_util.calculate_precision_at_equal_recall_rate(
392 |                   predictions_val, labels_val)
393 |               gap = eval_util.calculate_gap(predictions_val, labels_val)
394 |
395 |               logging.info(
396 |                   "%s: training step " + str(global_step_val) + "| Hit@1: " +
397 |                   ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " +
398 |                   ("%.2f" % gap) + " Loss: " + str(loss_val),
399 |                   task_as_string(self.task))
400 |
401 |               sv.summary_writer.add_summary(
402 |                   utils.MakeSummary("model/Training_Hit@1", hit_at_one),
403 |                   global_step_val)
404 |               sv.summary_writer.add_summary(
405 |                   utils.MakeSummary("model/Training_Perr", perr), global_step_val)
406 |               sv.summary_writer.add_summary(
407 |                   utils.MakeSummary("model/Training_GAP", gap), global_step_val)
408 |               sv.summary_writer.add_summary(
409 | utils.MakeSummary("global_step/Examples/Second", 410 | examples_per_second), global_step_val) 411 | sv.summary_writer.flush() 412 | 413 | # Exporting the model every x steps 414 | time_to_export = ((self.last_model_export_step == 0) or 415 | (global_step_val - self.last_model_export_step 416 | >= self.export_model_steps)) 417 | 418 | if self.is_master and time_to_export: 419 | self.export_model(global_step_val, sv.saver, sv.save_path, sess) 420 | self.last_model_export_step = global_step_val 421 | 422 | # Exporting the final model 423 | if self.is_master: 424 | self.export_model(global_step_val, sv.saver, sv.save_path, sess) 425 | 426 | except tf.errors.OutOfRangeError: 427 | logging.info("%s: Done training -- epoch limit reached.", 428 | task_as_string(self.task)) 429 | 430 | logging.info("%s: Exited training loop.", task_as_string(self.task)) 431 | sv.Stop() 432 | 433 | def export_model(self, global_step_val, saver, save_path, session): 434 | 435 | # If the model has already been exported at this step, return. 436 | if global_step_val == self.last_model_export_step: 437 | return 438 | 439 | last_checkpoint = saver.save(session, save_path, global_step_val) 440 | 441 | model_dir = "{0}/export/step_{1}".format(self.train_dir, global_step_val) 442 | logging.info("%s: Exporting the model at step %s to %s.", 443 | task_as_string(self.task), global_step_val, model_dir) 444 | 445 | self.model_exporter.export_model( 446 | model_dir=model_dir, 447 | global_step_val=global_step_val, 448 | last_checkpoint=last_checkpoint) 449 | 450 | 451 | def start_server_if_distributed(self): 452 | """Starts a server if the execution is distributed.""" 453 | 454 | if self.cluster: 455 | logging.info("%s: Starting trainer within cluster %s.", 456 | task_as_string(self.task), self.cluster.as_dict()) 457 | server = start_server(self.cluster, self.task) 458 | target = server.target 459 | device_fn = tf.train.replica_device_setter( 460 | ps_device="/job:ps", 461 | worker_device="/job:%s/task:%d" % (self.task.type, self.task.index), 462 | cluster=self.cluster) 463 | else: 464 | target = "" 465 | device_fn = "" 466 | return (target, device_fn) 467 | 468 | def remove_training_directory(self, train_dir): 469 | """Removes the training directory.""" 470 | try: 471 | logging.info( 472 | "%s: Removing existing train directory.", 473 | task_as_string(self.task)) 474 | gfile.DeleteRecursively(train_dir) 475 | except: 476 | logging.error( 477 | "%s: Failed to delete directory " + train_dir + 478 | " when starting a new model. Please delete it manually and" + 479 | " try again.", task_as_string(self.task)) 480 | 481 | def get_meta_filename(self, start_new_model, train_dir): 482 | if start_new_model: 483 | logging.info("%s: Flag 'start_new_model' is set. Building a new model.", 484 | task_as_string(self.task)) 485 | return None 486 | 487 | latest_checkpoint = tf.train.latest_checkpoint(train_dir) 488 | if not latest_checkpoint: 489 | logging.info("%s: No checkpoint file found. Building a new model.", 490 | task_as_string(self.task)) 491 | return None 492 | 493 | meta_filename = latest_checkpoint + ".meta" 494 | if not gfile.Exists(meta_filename): 495 | logging.info("%s: No meta graph file found. 
Building a new model.", 496 | task_as_string(self.task)) 497 | return None 498 | else: 499 | return meta_filename 500 | 501 | def recover_model(self, meta_filename): 502 | logging.info("%s: Restoring from meta graph file %s", 503 | task_as_string(self.task), meta_filename) 504 | return tf.train.import_meta_graph(meta_filename) 505 | 506 | def build_model(self, model, reader): 507 | """Find the model and build the graph.""" 508 | 509 | label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])() 510 | optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train]) 511 | 512 | build_graph(reader=reader, 513 | model=model, 514 | optimizer_class=optimizer_class, 515 | clip_gradient_norm=FLAGS.clip_gradient_norm, 516 | train_data_pattern=FLAGS.train_data_pattern, 517 | label_loss_fn=label_loss_fn, 518 | base_learning_rate=FLAGS.base_learning_rate, 519 | learning_rate_decay=FLAGS.learning_rate_decay, 520 | learning_rate_decay_examples=FLAGS.learning_rate_decay_examples, 521 | regularization_penalty=FLAGS.regularization_penalty, 522 | num_readers=FLAGS.num_readers, 523 | batch_size=FLAGS.batch_size, 524 | num_epochs=FLAGS.num_epochs) 525 | 526 | return tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=5) 527 | 528 | 529 | def get_reader(): 530 | # Convert feature_names and feature_sizes to lists of values. 531 | feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes( 532 | FLAGS.feature_names, FLAGS.feature_sizes) 533 | 534 | if FLAGS.frame_features: 535 | reader = readers.YT8MFrameFeatureReader( 536 | feature_names=feature_names, feature_sizes=feature_sizes) 537 | else: 538 | reader = readers.YT8MAggregatedFeatureReader( 539 | feature_names=feature_names, feature_sizes=feature_sizes) 540 | 541 | return reader 542 | 543 | 544 | class ParameterServer(object): 545 | """A parameter server to serve variables in a distributed execution.""" 546 | 547 | def __init__(self, cluster, task): 548 | """Creates a ParameterServer. 549 | 550 | Args: 551 | cluster: A tf.train.ClusterSpec if the execution is distributed. 552 | None otherwise. 553 | task: A TaskSpec describing the job type and the task index. 554 | """ 555 | 556 | self.cluster = cluster 557 | self.task = task 558 | 559 | def run(self): 560 | """Starts the parameter server.""" 561 | 562 | logging.info("%s: Starting parameter server within cluster %s.", 563 | task_as_string(self.task), self.cluster.as_dict()) 564 | server = start_server(self.cluster, self.task) 565 | server.join() 566 | 567 | 568 | def start_server(cluster, task): 569 | """Creates a Server. 570 | 571 | Args: 572 | cluster: A tf.train.ClusterSpec if the execution is distributed. 573 | None otherwise. 574 | task: A TaskSpec describing the job type and the task index. 575 | """ 576 | 577 | if not task.type: 578 | raise ValueError("%s: The task type must be specified." % 579 | task_as_string(task)) 580 | if task.index is None: 581 | raise ValueError("%s: The task index must be specified." % 582 | task_as_string(task)) 583 | 584 | # Create and start a server. 585 | return tf.train.Server( 586 | tf.train.ClusterSpec(cluster), 587 | protocol="grpc", 588 | job_name=task.type, 589 | task_index=task.index) 590 | 591 | def task_as_string(task): 592 | return "/job:%s/task:%s" % (task.type, task.index) 593 | 594 | def main(unused_argv): 595 | # Load the environment. 596 | env = json.loads(os.environ.get("TF_CONFIG", "{}")) 597 | 598 | # Load the cluster data from the environment. 
599 | cluster_data = env.get("cluster", None) 600 | cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None 601 | 602 | # Load the task data from the environment. 603 | task_data = env.get("task", None) or {"type": "master", "index": 0} 604 | task = type("TaskSpec", (object,), task_data) 605 | 606 | # Logging the version. 607 | logging.set_verbosity(tf.logging.INFO) 608 | logging.info("%s: Tensorflow version: %s.", 609 | task_as_string(task), tf.__version__) 610 | 611 | # Dispatch to a master, a worker, or a parameter server. 612 | if not cluster or task.type == "master" or task.type == "worker": 613 | 614 | model = find_class_by_name(FLAGS.model, 615 | [frame_level_models, video_level_models])() 616 | 617 | reader = get_reader() 618 | 619 | model_exporter = export_model.ModelExporter( 620 | frame_features=FLAGS.frame_features, 621 | model=model, 622 | reader=reader) 623 | 624 | Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter, 625 | FLAGS.log_device_placement, FLAGS.max_steps, 626 | FLAGS.export_model_steps).run(start_new_model=FLAGS.start_new_model) 627 | 628 | elif task.type == "ps": 629 | 630 | ParameterServer(cluster, task).run() 631 | 632 | else: 633 | 634 | raise ValueError("%s: Invalid task_type: %s." % 635 | (task_as_string(task), task.type)) 636 | 637 | if __name__ == "__main__": 638 | app.run() 639 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains a collection of util functions for training and evaluating. 16 | """ 17 | 18 | import numpy 19 | import tensorflow as tf 20 | from tensorflow import logging 21 | 22 | 23 | def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2): 24 | """Dequantize the feature from the byte format to the float format. 25 | 26 | Args: 27 | feat_vector: the input 1-d vector. 28 | max_quantized_value: the maximum of the quantized value. 29 | min_quantized_value: the minimum of the quantized value. 30 | 31 | Returns: 32 | A float vector which has the same shape as feat_vector. 33 | """ 34 | assert max_quantized_value > min_quantized_value 35 | quantized_range = max_quantized_value - min_quantized_value 36 | scalar = quantized_range / 255.0 37 | bias = (quantized_range / 512.0) + min_quantized_value 38 | return feat_vector * scalar + bias 39 | 40 | 41 | def MakeSummary(name, value): 42 | """Creates a tf.Summary proto with the given name and value.""" 43 | summary = tf.Summary() 44 | val = summary.value.add() 45 | val.tag = str(name) 46 | val.simple_value = float(value) 47 | return summary 48 | 49 | 50 | def AddGlobalStepSummary(summary_writer, 51 | global_step_val, 52 | global_step_info_dict, 53 | summary_scope="Eval"): 54 | """Add the global_step summary to the Tensorboard. 
55 | 56 | Args: 57 | summary_writer: Tensorflow summary_writer. 58 | global_step_val: a int value of the global step. 59 | global_step_info_dict: a dictionary of the evaluation metrics calculated for 60 | a mini-batch. 61 | summary_scope: Train or Eval. 62 | 63 | Returns: 64 | A string of this global_step summary 65 | """ 66 | this_hit_at_one = global_step_info_dict["hit_at_one"] 67 | this_perr = global_step_info_dict["perr"] 68 | this_loss = global_step_info_dict["loss"] 69 | examples_per_second = global_step_info_dict.get("examples_per_second", -1) 70 | 71 | summary_writer.add_summary( 72 | MakeSummary("GlobalStep/" + summary_scope + "_Hit@1", this_hit_at_one), 73 | global_step_val) 74 | summary_writer.add_summary( 75 | MakeSummary("GlobalStep/" + summary_scope + "_Perr", this_perr), 76 | global_step_val) 77 | summary_writer.add_summary( 78 | MakeSummary("GlobalStep/" + summary_scope + "_Loss", this_loss), 79 | global_step_val) 80 | 81 | if examples_per_second != -1: 82 | summary_writer.add_summary( 83 | MakeSummary("GlobalStep/" + summary_scope + "_Example_Second", 84 | examples_per_second), global_step_val) 85 | 86 | summary_writer.flush() 87 | info = ("global_step {0} | Batch Hit@1: {1:.3f} | Batch PERR: {2:.3f} | Batch Loss: {3:.3f} " 88 | "| Examples_per_sec: {4:.3f}").format( 89 | global_step_val, this_hit_at_one, this_perr, this_loss, 90 | examples_per_second) 91 | return info 92 | 93 | 94 | def AddEpochSummary(summary_writer, 95 | global_step_val, 96 | epoch_info_dict, 97 | summary_scope="Eval"): 98 | """Add the epoch summary to the Tensorboard. 99 | 100 | Args: 101 | summary_writer: Tensorflow summary_writer. 102 | global_step_val: a int value of the global step. 103 | epoch_info_dict: a dictionary of the evaluation metrics calculated for the 104 | whole epoch. 105 | summary_scope: Train or Eval. 106 | 107 | Returns: 108 | A string of this global_step summary 109 | """ 110 | epoch_id = epoch_info_dict["epoch_id"] 111 | avg_hit_at_one = epoch_info_dict["avg_hit_at_one"] 112 | avg_perr = epoch_info_dict["avg_perr"] 113 | avg_loss = epoch_info_dict["avg_loss"] 114 | aps = epoch_info_dict["aps"] 115 | gap = epoch_info_dict["gap"] 116 | mean_ap = numpy.mean(aps) 117 | 118 | summary_writer.add_summary( 119 | MakeSummary("Epoch/" + summary_scope + "_Avg_Hit@1", avg_hit_at_one), 120 | global_step_val) 121 | summary_writer.add_summary( 122 | MakeSummary("Epoch/" + summary_scope + "_Avg_Perr", avg_perr), 123 | global_step_val) 124 | summary_writer.add_summary( 125 | MakeSummary("Epoch/" + summary_scope + "_Avg_Loss", avg_loss), 126 | global_step_val) 127 | summary_writer.add_summary( 128 | MakeSummary("Epoch/" + summary_scope + "_MAP", mean_ap), 129 | global_step_val) 130 | summary_writer.add_summary( 131 | MakeSummary("Epoch/" + summary_scope + "_GAP", gap), 132 | global_step_val) 133 | summary_writer.flush() 134 | 135 | info = ("epoch/eval number {0} | Avg_Hit@1: {1:.3f} | Avg_PERR: {2:.3f} " 136 | "| MAP: {3:.3f} | GAP: {4:.4f} | Avg_Loss: {5:3f}").format( 137 | epoch_id, avg_hit_at_one, avg_perr, mean_ap, gap, avg_loss) 138 | return info 139 | 140 | def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes): 141 | """Extract the list of feature names and the dimensionality of each feature 142 | from string of comma separated values. 
--------------------------------------------------------------------------------
/video_level_models.py:
--------------------------------------------------------------------------------
# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains model definitions."""
import math

import models
import tensorflow as tf
import utils

from tensorflow import flags
import tensorflow.contrib.slim as slim

FLAGS = flags.FLAGS
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
flags.DEFINE_float(
    "moe_l2", 1e-8,
    "L2 penalty for MoeModel.")
flags.DEFINE_integer(
    "moe_low_rank_gating", -1,
    "Rank of the low-rank factorization of the MoeModel gates "
    "(-1 disables the factorization).")
flags.DEFINE_bool(
    "moe_prob_gating", False,
    "Whether to apply probability gating in MoeModel.")
flags.DEFINE_string(
    "moe_prob_gating_input", "prob",
    "Input to the probability gating in MoeModel: 'prob' gates on the "
    "predicted probabilities, anything else gates on the raw model input.")
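# Illustrative sketch, not part of the original file: the per-class
# mixture-of-experts probability computed by hand for num_mixtures = 2.
# The gating softmax spans num_mixtures + 1 logits; the extra "dummy"
# expert always predicts 0, so any gate mass it receives simply lowers
# the class probability, exactly as in MoeModel.create_model below.
#
#   import numpy as np
#   gates = np.exp(np.array([1.0, 0.5, -1.0]))
#   gates /= gates.sum()                                     # softmax, sums to 1
#   experts = 1.0 / (1.0 + np.exp(-np.array([2.0, -0.5])))   # sigmoid
#   p_class = (gates[:2] * experts).sum()                    # dummy adds 0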
class MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in
    the mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.

    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2  # The flag overrides the l2_penalty argument.
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    # gating_remove_diag is defined in frame_level_models.py.
    remove_diag = FLAGS.gating_remove_diag

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients of the gating weights.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")

      gates = tf.sigmoid(gates)

      probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
--------------------------------------------------------------------------------
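Finally, a minimal sketch of how MoeModel is typically instantiated, assuming TensorFlow 1.x graph mode; the 1152-dimensional input (1024 RGB + 128 audio video-level features) and the vocabulary size are illustrative, and frame_level_models is imported because create_model reads FLAGS.gating_remove_diag, which is defined in that module:

import tensorflow as tf

import frame_level_models  # defines gating_remove_diag, read by create_model
import video_level_models

# Hypothetical shapes: video-level input features and label vocabulary size.
model_input = tf.placeholder(tf.float32, shape=[None, 1152])
outputs = video_level_models.MoeModel().create_model(
    model_input, vocab_size=4716, is_training=True)
predictions = outputs["predictions"]  # [batch_size, 4716] probabilities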