├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── aggregation_modules.py ├── attention_modules.py ├── average_precision_calculator.py ├── eval.py ├── eval_util.py ├── export_model.py ├── frame_level_models.py ├── inference.py ├── losses.py ├── mean_average_precision_calculator.py ├── model_utils.py ├── models.py ├── module_utils.py ├── modules.py ├── paper └── Learnable_Pooling_Methods_for_Video_Classification.pdf ├── pathmagic.py ├── readers.py ├── rnn_modules.py ├── scripts ├── batch_evaluate.py ├── generate_gcloud_evaluation.py ├── generate_gcloud_inference.py ├── generate_gcloud_train.py └── generate_gcloud_train_valid.py ├── train.py ├── transformer_utils.py ├── utils.py ├── video_level_models.py └── video_pooling_modules.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | ## Contents sepcific to MAC OS 107 | # General 108 | .DS_Store 109 | .AppleDouble 110 | .LSOverride 111 | 112 | # Icon must end with two \r 113 | Icon 114 | 115 | 116 | # Thumbnails 117 | ._* 118 | 119 | # Files that might appear in the root of a volume 120 | .DocumentRevisions-V100 121 | .fseventsd 122 | .Spotlight-V100 123 | .TemporaryItems 124 | .Trashes 125 | .VolumeIcon.icns 126 | .com.apple.timemachine.donotpresent 127 | 128 | # Directories potentially created on remote AFP share 129 | .AppleDB 130 | .AppleDesktop 131 | Network Trash Folder 132 | Temporary Items 133 | .apdisk 134 | 135 | # Research file 136 | history/ 137 | 138 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | 
Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learnable Pooling Methods for Video Classification 2 | The repository is based on the starter code provided by Google AI. It contains a code for training and evaluating models for [YouTube-8M](https://research.google.com/youtube8m/) dataset. The detailed table of contents and descriptions can be found at [original repository](https://github.com/google/youtube-8m). 3 | 4 | The repository contains models from team "Deep Topology". Our approach was accepted in [ECCV - The 2nd Workshop on YouTube-8M Large-Scale Video Understanding](https://research.google.com/youtube8m/workshop2018/index.html). The presentation is accessible in ECCV Workshop page. 5 | 6 | Presentation: TBA \ 7 | Paper: [Link](paper/Learnable_Pooling_Methods_for_Video_Classification.pdf), [Arxiv](https://arxiv.org/abs/1810.00530) 8 | 9 | # Usage 10 | In [frame_level_models.py](frame_level_models.py), prototype 1, 2 and 3 refer to sections 3.1, 3.2 and 3.2 in the paper. 
The detailed instructions to train
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Copyright 2018 Juhan Bae, Ruijian An Inc. All Rights Reserved. 16 | # 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | # 21 | # http://www.apache.org/licenses/LICENSE-2.0 22 | # 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS-IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 28 | # noinspection PyUnresolvedReferences 29 | -------------------------------------------------------------------------------- /aggregation_modules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Deep Topology Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class IndirectClusterMeanPoolModule(modules.BaseModule):
    """ Mean pooling via indirect clustering.

    Per-frame weights are derived from a self-attention style similarity
    matrix over ``t_inputs`` and then applied to ``c_inputs``.
    """

    def __init__(self, l2_normalize):
        """ Initialize IndirectClusterMeanPoolModule.
        :param l2_normalize: bool; L2-normalize the pooled output if True.
        """
        self.l2_normalize = l2_normalize

    def forward(self, t_inputs, c_inputs, **unused_params):
        """ Weighted mean pooling driven by frame-to-frame similarity.
        :param t_inputs: batch_size x max_frames x num_features
        :param c_inputs: batch_size x max_frames x num_features
        :return: batch_size x num_features
        """
        # Pairwise frame similarities -> batch_size x max_frames x max_frames.
        similarity = tf.matmul(t_inputs, tf.transpose(t_inputs, perm=[0, 2, 1]))
        # Zero-out negative affinities, then collapse each row to a single
        # per-frame score -> batch_size x max_frames x 1.
        similarity = tf.nn.relu(tf.expand_dims(similarity, -1))
        scores = tf.reduce_sum(similarity, axis=2)
        # Normalize scores across frames.
        weights = tf.nn.softmax(scores, axis=1)

        # Weighted mean over the frame axis -> batch_size x num_features.
        pooled = tf.reduce_mean(tf.multiply(c_inputs, weights), axis=1)

        if self.l2_normalize:
            pooled = tf.nn.l2_normalize(pooled, 1)

        return pooled
class MeanStdPoolModule(modules.BaseModule):
    """ Mean-Std pooling method.

    NOTE(review): despite the name, forward currently returns only the
    frame-wise mean; no std component is computed or concatenated — confirm
    whether this is intentional or unfinished.
    """
    def __init__(self, l2_normalize):
        """ Initialize Mean STD module.
        :param l2_normalize: bool; currently never applied in forward.
        """
        self.l2_normalize = l2_normalize

    def forward(self, inputs, **unused_params):
        """ Forward method for MeanStdPoolModule.
        :param inputs: batch_size x max_frames x num_features
        :return: batch_size x num_features (frame-wise mean only)
        """
        # Mean over the frame axis; l2_normalize is not applied here.
        moments = tf.reduce_mean(inputs, 1)
        return moments


class IndirectClusterMaxMeanPoolModule(modules.BaseModule):
    """ Max-Mean pooling method. Mean is computed from weighted average
    inspired from self-attention mechanism (indirect clustering)
    """
    def __init__(self, l2_normalize):
        """ Initialize IndirectClusterMaxMeanPoolModule
        :param l2_normalize: bool; L2-normalize each pooled part if True.
        """
        self.l2_normalize = l2_normalize

    def forward(self, inputs, **unused_params):
        """ Forward method for max & mean pooling with indirect clustering (self-attention).
        :param inputs: batch_size x max_frames x num_features
        :return: batch_size x (2 * num_features) — concat of mean and max pools
        """
        # Pairwise frame similarities.
        attention = tf.matmul(inputs, tf.transpose(inputs, perm=[0, 2, 1]))
        # -> batch_size x max_frames x max_frames
        attention = tf.expand_dims(attention, -1)
        # Zero-out negative similarities before aggregating.
        attention = tf.nn.relu(attention)

        # Collapse each row of similarities into one score per frame.
        attention = tf.reduce_sum(attention, axis=2)
        # -> batch_size x max_frames x 1
        attention = tf.nn.softmax(attention, axis=1)

        # Attention-weighted mean alongside a plain max pool.
        mean_pool = tf.reduce_mean(tf.multiply(inputs, attention), axis=1)
        max_pool = tf.reduce_max(inputs, axis=1)
        # -> batch_size x num_features

        if self.l2_normalize:
            mean_pool = tf.nn.l2_normalize(mean_pool, 1)
            max_pool = tf.nn.l2_normalize(max_pool, 1)

        concat_pool = tf.concat([mean_pool, max_pool], 1)
        return concat_pool
""" 113 | def __init__(self, l2_normalize=True): 114 | """ Initialize MaxMeanPoolingModule. 115 | :param l2_normalize: bool 116 | """ 117 | self.l2_normalize = l2_normalize 118 | 119 | def forward(self, inputs, **unused_params): 120 | """ Forward method for mean & max pooling. 121 | :param inputs: batch_size x max_frames x num_features 122 | :return: batch_size x feature_size 123 | """ 124 | max_pooled = tf.reduce_max(inputs, 1) 125 | avg_pooled = tf.reduce_mean(inputs, 1) 126 | 127 | if self.l2_normalize: 128 | max_pooled = tf.nn.l2_normalize(max_pooled, 1) 129 | avg_pooled = tf.nn.l2_normalize(avg_pooled, 1) 130 | # -> batch_size x num_features 131 | 132 | concat = tf.concat([max_pooled, avg_pooled], 1) 133 | return concat 134 | 135 | 136 | class MaxPoolingModule(modules.BaseModule): 137 | """ Max pooling method. """ 138 | def __init__(self, l2_normalize=False): 139 | """ Initialize MaxPoolingModule. 140 | :param l2_normalize: bool 141 | """ 142 | self.l2_normalize = l2_normalize 143 | 144 | def forward(self, inputs, **unused_params): 145 | """ Forward method for max pooling. 146 | :param inputs: batch_size x max_frames x num_features 147 | :return: batch_size x feature_size 148 | """ 149 | return tf.reduce_max(inputs, 1) 150 | 151 | 152 | class MeanPooling(modules.BaseModule): 153 | """ Average pooling method. """ 154 | def __init__(self, l2_normalize=False): 155 | """ Initialize MeanPooling. 156 | :param l2_normalize: bool 157 | """ 158 | self.l2_normalize = l2_normalize 159 | 160 | def forward(self, inputs, **unused_params): 161 | """ Forward method for mean pooling. 162 | :param inputs: batch_size x max_frames x num_features 163 | :return: batch_size x feature_size 164 | """ 165 | return tf.reduce_mean(inputs, 1) 166 | 167 | 168 | class GemPoolingModule(modules.BaseModule): 169 | """ Generalized Mean Pooling. """ 170 | def __init__(self, l2_normalize=False, eps=1e-6): 171 | """ Initialize GemPoolingModule. 
class GemPoolingModule(modules.BaseModule):
    """ Generalized Mean Pooling.

    GeM interpolates between mean pooling (p=1) and max pooling (p->inf)
    via a learnable exponent p.
    """
    def __init__(self, l2_normalize=False, eps=1e-6):
        """ Initialize GemPoolingModule.
        :param l2_normalize: bool; NOTE(review): never applied in forward.
        :param eps: float; lower clip bound so tf.pow sees positive inputs.
        """
        self.l2_normalize = l2_normalize
        self.eps = eps

    # TODO: Implementation is incorrect / incomplete.
    def forward(self, inputs, **unused_params):
        """ Forward method for GeM pooling
        :param inputs: batch_size x max_frames x num_features
        :return: batch_size x feature_size
        """
        # Learnable exponent. NOTE(review): no initializer is supplied, so the
        # framework default is used — GeM implementations usually initialize
        # p around 3; confirm intended behavior.
        p = tf.get_variable("p",
                            shape=[1])
        # Clip some values.
        # NOTE(review): clip_value_max=None may not be accepted by
        # tf.clip_by_value — likely part of why this is marked incorrect.
        frames = tf.clip_by_value(inputs, clip_value_min=self.eps, clip_value_max=None)
        # GeM: mean of p-th powers, then the p-th root.
        frames = tf.pow(frames, p)
        frames = tf.reduce_mean(frames, 1)
        frames = tf.pow(frames, 1. / p)
        return frames
class OneFcAttention(modules.BaseModule):
    """ Attention pooling with a single fully-connected projection.

    Projects frame features to per-cluster attention logits, softmaxes over
    frames, and pools frames into `num_cluster` attention-weighted vectors.
    """

    def __init__(self, num_features, num_frames, num_cluster, do_shift=True):
        """ Initialize OneFcAttention.
        :param num_features: per-frame feature dimensionality
        :param num_frames: frames per video
        :param num_cluster: number of attention clusters
        :param do_shift: bool; apply learned scale/shift + L2 normalization
        """
        self.num_feature = num_features
        self.num_frames = num_frames
        self.num_cluster = num_cluster
        self.do_shift = do_shift

    def forward(self, inputs, **unused_params):
        """ Forward method for OneFcAttention.
        :param inputs: (batch_size * num_frames) x num_features
        :return: batch_size x (num_cluster * num_features)
        """
        attention_weights = \
            tf.get_variable("one_fc_attention_weight",
                            [self.num_feature, self.num_cluster],
                            initializer=tf.contrib.layers.xavier_initializer())
        attention = tf.matmul(inputs, attention_weights)
        attention = tf.reshape(attention, [-1, self.num_frames, self.num_cluster])
        # Scale logits by 1/sqrt(d), dot-product-attention style.
        attention = tf.scalar_mul(1 / math.sqrt(self.num_feature), attention)
        # Fix: use `axis` instead of the deprecated `dim` keyword; identical
        # behavior, and consistent with tf.nn.softmax usage elsewhere in
        # this project.
        attention = tf.nn.softmax(attention, axis=1)

        reshaped_inputs = tf.reshape(inputs, [-1, self.num_frames, self.num_feature])
        activation = tf.transpose(attention, perm=[0, 2, 1])
        activation = tf.matmul(activation, reshaped_inputs)
        # -> batch_size x num_cluster x feature_size

        reshaped_activation = tf.reshape(activation, [-1, self.num_feature])

        if self.do_shift:
            # Learned affine transform followed by L2 normalization and a
            # 1/sqrt(num_cluster) rescale.
            alpha = \
                tf.get_variable("alpha",
                                [1],
                                initializer=tf.constant_initializer(1))
            beta = \
                tf.get_variable("beta",
                                [1],
                                initializer=tf.constant_initializer(0.01))

            reshaped_activation = alpha * reshaped_activation
            reshaped_activation = reshaped_activation + beta
            reshaped_activation = tf.nn.l2_normalize(reshaped_activation, 1)
            reshaped_activation = tf.scalar_mul(1 / math.sqrt(self.num_cluster), reshaped_activation)

        activation = tf.reshape(reshaped_activation, [-1, self.num_cluster * self.num_feature])

        return activation
class MultiHeadAttention(modules.BaseModule):
    """ Multi-head scaled dot-product self-attention over frames. """

    def __init__(self, num_heads, num_units,
                 max_frames, block_id):
        """ Initialize MultiHeadAttention
        :param num_heads: Number of self-attention modules
        :param num_units: last dimension of Q, K, V
        :param max_frames: frames per video
        :param block_id: id used to give each encoder block its own scope
        """
        self.num_heads = num_heads
        self.num_units = num_units
        self.max_frames = max_frames
        self.block_id = block_id

    def self_attention(self, inputs, scope_id):
        """ One attention head.
        :param inputs: (batch_size * max_frames) x feature_size
        :param scope_id: int; distinguishes per-head variable scopes
        :return: batch_size x max_frames x num_units
        """
        with tf.variable_scope("Block{}Layer{}".format(self.block_id, scope_id), reuse=tf.AUTO_REUSE):
            # Calculate query, key, value pair
            Q = tf.layers.dense(inputs, self.num_units, activation=tf.nn.relu)
            K = tf.layers.dense(inputs, self.num_units, activation=tf.nn.relu)
            V = tf.layers.dense(inputs, self.num_units, activation=tf.nn.relu)
            # Q, K, V: -> (batch_size * max_frames) x num_units

            # Reshape for self-attention calculation
            Q = tf.reshape(Q, [-1, self.max_frames, self.num_units])
            K = tf.reshape(K, [-1, self.max_frames, self.num_units])
            V = tf.reshape(V, [-1, self.max_frames, self.num_units])
            # Q, K, V: -> batch_size x max_frames x num_units

            # Self-attention
            attention = tf.matmul(Q, tf.transpose(K, perm=[0, 2, 1]))
            # attention: -> batch_size x max_frames x max_frames
            float_cpy = tf.cast(self.num_units, dtype=tf.float32)
            # Bug fix: the logits were previously divided by sqrt(num_units)
            # twice (once before the softmax call and once inside it), an
            # effective 1/num_units scale. Scaled dot-product attention
            # divides exactly once.
            attention = tf.nn.softmax(tf.divide(attention, tf.sqrt(float_cpy)))

            output = tf.matmul(attention, V)
            # output: -> batch_size x max_frames x num_units
            return output

    def forward(self, inputs, **unused_params):
        """ Run all heads and concatenate along the feature axis.
        :param inputs: (batch_size * max_frames) x feature_size
        :return: batch_size x max_frames x (num_units * num_heads)
        """
        result = self.self_attention(inputs, scope_id=0)
        for i in range(1, self.num_heads):
            # (A former `result = tf.identity(result)` here was a no-op
            # and has been removed.)
            output = self.self_attention(inputs, scope_id=i)
            result = tf.concat([result, output], 2)
        # result: -> batch_size x max_frames x (num_units * num_heads)
        return result
class TransformerEncoderBlock(modules.BaseModule):
    """ One Transformer encoder block: multi-head self-attention followed by a
    two-layer 1x1-convolution feed-forward sub-layer. """

    def __init__(self,
                 is_training, num_units, max_frames, feature_size, num_heads, block_id):
        """ Initialize Transformer Encoder block

        :param is_training: bool
        :param num_units: Number of hidden units of fully connected layers
        :param max_frames: frames per video
        :param feature_size: dimensionality of the per-frame input features
        :param num_heads: number of attention heads
        :param block_id: scope id so stacked blocks get distinct variables
        """
        self.is_training = is_training
        self.num_units = num_units
        self.max_frames = max_frames
        self.feature_size = feature_size
        self.num_heads = num_heads
        self.block_id = block_id

    def forward(self, inputs, **unused_params):
        """ One block of encoder containing one self-attention layer and one fully connected layer.

        :param inputs: (batch_size * max_frames) x feature_size
        :return: (batch_size * max_frames) x feature_size
            NOTE(review): the final reshape assumes the last conv output width
            (num_units) equals feature_size — confirm num_units == feature_size
            at all call sites.
        """
        multi_head_layer = MultiHeadAttention(self.num_heads, self.num_units, self.max_frames, self.block_id)

        attention_output = multi_head_layer.forward(inputs)
        # output: -> batch_size x max_frames x (num_units * num_heads)

        attention_output = tf.reshape(attention_output, [-1, self.num_units * self.num_heads])
        # output: -> (batch_size * max_frames) x (num_units * num_heads)

        # Project the concatenated heads back to the input feature size so
        # the residual addition below is shape-compatible.
        attention_output = tf.layers.dense(attention_output, self.feature_size, activation=tf.nn.relu)
        # output: -> (batch_size * max_frames) x feature_size

        # Residual connection & Layer normalization
        attention_output += inputs
        attention_output = tf.contrib.layers.layer_norm(attention_output)

        # 2 layers of 1 x 1 convolution
        output = tf.reshape(attention_output, [-1, self.max_frames, self.feature_size])
        output = tf.layers.conv1d(output, filters=4 * self.num_units, kernel_size=1, activation=tf.nn.relu,
                                  use_bias=True)
        output = tf.layers.conv1d(output, filters=self.num_units, kernel_size=1, activation=None, use_bias=True)

        # Residual connection & Layer normalization
        # NOTE(review): no residual is actually added around this conv
        # sub-layer before the layer norm — this deviates from the standard
        # Transformer encoder; confirm whether intentional.
        output = tf.contrib.layers.layer_norm(output)
        output = tf.reshape(output, [-1, self.feature_size])

        return output
vocab_size, is_training, scope_id=None): 159 | """ Initialize class PnGateModule. 160 | :param vocab_size: int 161 | Size of the classes. 162 | :param is_training: bool 163 | True iff the model is being trained. 164 | :param scope_id: Object 165 | """ 166 | self.vocab_size = vocab_size 167 | self.scope_id = scope_id 168 | self.is_training = is_training 169 | 170 | def forward(self, inputs, **unused_params): 171 | """ PN Gate for correlation learning. 172 | vocabularies -> P gate -> N gate -> output 173 | :param inputs: batch_size x vocab_size 174 | :return: batch_size x vocab_size 175 | """ 176 | p_gating_weights = \ 177 | tf.get_variable("p_pn_gate", 178 | [self.vocab_size, self.vocab_size], 179 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.vocab_size))) 180 | n_gating_weights = \ 181 | tf.get_variable("n_pn_gate", 182 | [self.vocab_size, self.vocab_size], 183 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.vocab_size))) 184 | 185 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 186 | p_activation = tf.matmul(inputs, p_gating_weights) 187 | p_activation = tf.nn.relu6(p_activation) 188 | p_gate = inputs + p_activation 189 | 190 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 191 | n_activation = tf.matmul(p_gate, n_gating_weights) 192 | n_activation = -1 * n_activation 193 | n_activation = tf.nn.relu6(n_activation) 194 | n_gate = p_gate + (-1 * n_activation) 195 | 196 | output = tf.nn.softmax(n_gate) 197 | return output 198 | 199 | 200 | class NpGateModule(modules.BaseModule): 201 | def __init__(self, vocab_size, is_training, scope_id=None): 202 | """ Initialize class NpGateModule. 203 | :param vocab_size: int 204 | Size of the classes. 205 | :param is_training: bool 206 | True iff the model is being trained. 
207 | :param scope_id: Object 208 | """ 209 | self.vocab_size = vocab_size 210 | self.scope_id = scope_id 211 | self.is_training = is_training 212 | 213 | def forward(self, inputs, **unused_params): 214 | """ PN Gate for correlation learning. 215 | vocabularies -> N gate -> P gate -> output 216 | :param inputs: batch_size x vocab_size 217 | :return: batch_size x vocab_size 218 | """ 219 | p_gating_weights = \ 220 | tf.get_variable("p_np_gate", 221 | [self.vocab_size, self.vocab_size], 222 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.vocab_size))) 223 | n_gating_weights = \ 224 | tf.get_variable("n_np_gate", 225 | [self.vocab_size, self.vocab_size], 226 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.vocab_size))) 227 | 228 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 229 | n_activation = tf.matmul(inputs, n_gating_weights) 230 | n_activation = -1 * n_activation 231 | n_activation = tf.nn.relu6(n_activation) 232 | n_gate = inputs + (-1 * n_activation) 233 | 234 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 235 | p_activation = tf.matmul(n_gate, p_gating_weights) 236 | p_activation = tf.nn.relu6(p_activation) 237 | p_gate = n_gate + p_activation 238 | output = tf.nn.softmax(p_gate) 239 | 240 | return output 241 | 242 | 243 | class PGateModule(modules.BaseModule): 244 | def __init__(self, vocab_size, is_training, scope_id=None): 245 | """ Initialize class PGateModule. 246 | :param vocab_size: int 247 | Size of the classes. 248 | :param is_training: bool 249 | True iff the model is being trained. 250 | :param scope_id: Object 251 | """ 252 | self.vocab_size = vocab_size 253 | self.scope_id = scope_id 254 | self.is_training = is_training 255 | 256 | def forward(self, inputs, **unused_params): 257 | """ PN Gate for correlation learning. 
258 | vocabularies -> P gate -> output 259 | :param inputs: batch_size x vocab_size 260 | :return: batch_size x vocab_size 261 | """ 262 | p_gating_weights = \ 263 | tf.get_variable("p_p_gate", 264 | [self.vocab_size, self.vocab_size], 265 | initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.vocab_size))) 266 | 267 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 268 | p_activation = tf.matmul(inputs, p_gating_weights) 269 | p_activation = tf.nn.relu6(p_activation) 270 | p_gate = inputs + p_activation 271 | output = tf.nn.softmax(p_gate) 272 | 273 | return output 274 | 275 | 276 | class CorNNGateModule(modules.BaseModule): 277 | def __init__(self, vocab_size, is_training, batch_norm=True, scope_id=None): 278 | """ Initialize a class CorNNGateModule. 279 | :param vocab_size: int 280 | Size of the classes. 281 | :param is_training: bool 282 | :param batch_norm: bool 283 | :param scope_id: int 284 | """ 285 | self.vocab_size = vocab_size 286 | self.is_training = is_training 287 | self.batch_norm = batch_norm 288 | self.scope_id = scope_id 289 | 290 | def forward(self, inputs, **unused_params): 291 | """ Forward function of CorNNGateModule. 
292 | :param inputs: batch_size x vocab_size 293 | :return: batch_size x vocab_size 294 | """ 295 | fc1_out = slim.fully_connected( 296 | inputs=inputs, 297 | num_outputs=self.vocab_size, 298 | activation_fn=nn.relu, 299 | scope="vocab_gate1_v1{}".format("" if self.scope_id is None else str(self.scope_id)) 300 | ) 301 | 302 | fc2_out = slim.fully_connected( 303 | inputs=fc1_out, 304 | num_outputs=self.vocab_size, 305 | activation_fn=nn.relu, 306 | scope="vocab_gate2_v1{}".format("" if self.scope_id is None else str(self.scope_id)) 307 | ) 308 | 309 | fc3_out = slim.fully_connected( 310 | inputs=fc2_out, 311 | num_outputs=self.vocab_size, 312 | activation_fn=nn.sigmoid, 313 | scope="vocab_gate3_v1{}".format("" if self.scope_id is None else str(self.scope_id)) 314 | ) 315 | 316 | return fc3_out 317 | 318 | 319 | class ContextGateV1(modules.BaseModule): 320 | """ 321 | Given the weight W, calculate sigmoid(WX + b) o X. o is an element-wise 322 | multiplication. 323 | """ 324 | def __init__(self, vocab_size, is_training, batch_norm=True, scope_id=None): 325 | """ Initialize a class ContextGateV1. The idea and implementation is adopted from WILLOW. 326 | :param vocab_size: int 327 | Size of the classes. 
328 | :param is_training: bool 329 | :param batch_norm: bool 330 | :param scope_id: int 331 | """ 332 | self.vocab_size = vocab_size 333 | self.is_training = is_training 334 | self.batch_norm = batch_norm 335 | self.scope_id = scope_id 336 | 337 | def forward(self, inputs, **unused_params): 338 | """ Forward function of ContextGateV1 339 | :param inputs: batch_size x vocab_size 340 | :return: batch_size x vocab_size 341 | """ 342 | gating_weights = tf.get_variable("vocab_gate_v1{}".format("" if self.scope_id is None else str(self.scope_id)), 343 | [self.vocab_size, self.vocab_size]) 344 | 345 | # batch_size x vocab_size, vocab_size x vocab_size --> batch_size x vocab_size 346 | gates = tf.matmul(inputs, gating_weights) 347 | 348 | if self.batch_norm: 349 | gates = slim.batch_norm( 350 | gates, 351 | center=True, 352 | scale=True, 353 | is_training=self.is_training, 354 | scope="vocab_gate_bn_v1{}".format("" if self.scope_id is None else str(self.scope_id))) 355 | 356 | gates = tf.sigmoid(gates) 357 | 358 | # batch_size x vocab_size, batch_size x vocab_size -> batch_size x vocab_size 359 | updated_inputs = tf.multiply(inputs, gates) 360 | 361 | # batch_size x vocab_size 362 | return updated_inputs 363 | -------------------------------------------------------------------------------- /average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

"""Calculate or keep track of the interpolated average precision.

It provides an interface for calculating interpolated average precision for an
entire list or the top-n ranked items. For the definition of the
(non-)interpolated average precision:
http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf

Example usages:
1) Use it as a static function call to directly calculate average precision for
a short ranked list in the memory.

```
import random
import numpy as np

p = np.array([random.random() for _ in range(10)])
a = np.array([random.choice([0, 1]) for _ in range(10)])

ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
```

2) Use it as an object for long ranked list that cannot be stored in memory or
the case where partial predictions can be observed at a time (Tensorflow
predictions). In this case, we first call the function accumulate many times
to process parts of the ranked list. After processing all the parts, we call
peek_interpolated_ap_at_n.

```
p1 = np.array([random.random() for _ in range(5)])
a1 = np.array([random.choice([0, 1]) for _ in range(5)])
p2 = np.array([random.random() for _ in range(5)])
a2 = np.array([random.choice([0, 1]) for _ in range(5)])

# interpolated average precision at 10 using 1000 break points
calculator = average_precision_calculator.AveragePrecisionCalculator(10)
calculator.accumulate(p1, a1)
calculator.accumulate(p2, a2)
ap3 = calculator.peek_ap_at_n()
```
"""
# pathmagic only adjusts sys.path for intra-project imports; make it optional
# so this module can also be imported standalone (e.g. from unit tests).
try:
    # noinspection PyUnresolvedReferences
    import pathmagic
except ImportError:
    pass
import heapq
import random
import numbers
import numpy


class AveragePrecisionCalculator(object):
    """Calculate the average precision and average precision at n."""

    def __init__(self, top_n=None):
        """Construct an AveragePrecisionCalculator to calculate average precision.

        This class is used to calculate the average precision for a single label.

        Args:
          top_n: A positive Integer specifying the average precision at n, or
            None to use all provided data points.

        Raises:
          ValueError: An error occurred when the top_n is not a positive integer.
        """
        # BUG FIX: the original check used `top_n >= 0`, admitting top_n == 0,
        # which later crashes accumulate() with an IndexError on an empty
        # heap. Reject it here, as the error message already promises.
        if not ((isinstance(top_n, int) and top_n > 0) or top_n is None):
            raise ValueError("top_n must be a positive integer or None.")

        self._top_n = top_n  # average precision at n
        self._total_positives = 0  # total number of positives have seen
        # heapq maintains a MIN-heap, so the smallest retained prediction is
        # at index 0 and gets evicted first once the heap is full.
        self._heap = []  # min-heap of (prediction, actual)

    @property
    def heap_size(self):
        """Gets the heap size maintained in the class."""
        return len(self._heap)

    @property
    def num_accumulated_positives(self):
        """Gets the number of positive samples that have been accumulated."""
        return self._total_positives

    def accumulate(self, predictions, actuals, num_positives=None):
        """Accumulate the predictions and their ground truth labels.

        After the function call, we may call peek_ap_at_n to actually calculate
        the average precision.
        Note predictions and actuals must have the same shape.

        Args:
          predictions: a list storing the prediction scores.
          actuals: a list storing the ground truth labels. Any value
            larger than 0 will be treated as positives, otherwise as negatives.
          num_positives: If the 'predictions' and 'actuals' inputs aren't complete,
            then it's possible some true positives were missed in them. In that case,
            you can provide 'num_positives' in order to accurately track recall.

        Raises:
          ValueError: An error occurred when the format of the input is not the
            numpy 1-D array or the shape of predictions and actuals does not match.
        """
        if len(predictions) != len(actuals):
            raise ValueError("the shape of predictions and actuals does not match.")

        if num_positives is not None:
            if not isinstance(num_positives, numbers.Number) or num_positives < 0:
                # (typo fix: the message previously read "wan't")
                raise ValueError("'num_positives' was provided but it wasn't a nonzero number.")
            self._total_positives += num_positives
        else:
            self._total_positives += numpy.size(numpy.where(actuals > 0))

        topk = self._top_n
        heap = self._heap

        for i in range(numpy.size(predictions)):
            if topk is None or len(heap) < topk:
                heapq.heappush(heap, (predictions[i], actuals[i]))
            elif predictions[i] > heap[0][0]:  # heap[0] is the smallest
                # pop-then-push, identical to the original heappop + heappush
                heapq.heapreplace(heap, (predictions[i], actuals[i]))

    def clear(self):
        """Clear the accumulated predictions."""
        self._heap = []
        self._total_positives = 0

    def peek_ap_at_n(self):
        """Peek the non-interpolated average precision at n.

        Returns:
          The non-interpolated average precision at n (default 0).
          If n is larger than the length of the ranked list,
          the average precision will be returned.
        """
        if self.heap_size <= 0:
            return 0
        predlists = numpy.array(list(zip(*self._heap)))

        ap = self.ap_at_n(predlists[0],
                          predlists[1],
                          n=self._top_n,
                          total_num_positives=self._total_positives)
        return ap

    @staticmethod
    def ap(predictions, actuals):
        """Calculate the non-interpolated average precision.

        Args:
          predictions: a numpy 1-D array storing the sparse prediction scores.
          actuals: a numpy 1-D array storing the ground truth labels. Any value
            larger than 0 will be treated as positives, otherwise as negatives.

        Returns:
          The non-interpolated average precision at n.
          If n is larger than the length of the ranked list,
          the average precision will be returned.

        Raises:
          ValueError: An error occurred when the format of the input is not the
            numpy 1-D array or the shape of predictions and actuals does not match.
        """
        return AveragePrecisionCalculator.ap_at_n(predictions,
                                                  actuals,
                                                  n=None)

    @staticmethod
    def ap_at_n(predictions, actuals, n=20, total_num_positives=None):
        """Calculate the non-interpolated average precision.

        Args:
          predictions: a numpy 1-D array storing the sparse prediction scores.
          actuals: a numpy 1-D array storing the ground truth labels. Any value
            larger than 0 will be treated as positives, otherwise as negatives.
          n: the top n items to be considered in ap@n.
          total_num_positives: (optionally) you can specify the number of total
            positives in the list. If specified, it will be used in calculation.

        Returns:
          The non-interpolated average precision at n.
          If n is larger than the length of the ranked list,
          the average precision will be returned.

        Raises:
          ValueError: An error occurred when
            1) the format of the input is not the numpy 1-D array;
            2) the shape of predictions and actuals does not match;
            3) the input n is not a positive integer.
        """
        if len(predictions) != len(actuals):
            raise ValueError("the shape of predictions and actuals does not match.")

        if n is not None:
            if not isinstance(n, int) or n <= 0:
                raise ValueError("n must be 'None' or a positive integer."
                                 " It was '%s'." % n)

        ap = 0.0

        predictions = numpy.array(predictions)
        actuals = numpy.array(actuals)

        # add a shuffler to avoid overestimating the ap on ties
        predictions, actuals = AveragePrecisionCalculator._shuffle(predictions,
                                                                   actuals)
        sortidx = sorted(
            range(len(predictions)),
            key=lambda k: predictions[k],
            reverse=True)

        if total_num_positives is None:
            numpos = numpy.size(numpy.where(actuals > 0))
        else:
            numpos = total_num_positives

        if numpos == 0:
            return 0

        if n is not None:
            numpos = min(numpos, n)
        delta_recall = 1.0 / numpos
        poscount = 0.0

        # calculate the ap: each hit contributes precision@i * delta_recall
        r = len(sortidx)
        if n is not None:
            r = min(r, n)
        for i in range(r):
            if actuals[sortidx[i]] > 0:
                poscount += 1
                ap += poscount / (i + 1) * delta_recall
        return ap

    @staticmethod
    def _shuffle(predictions, actuals):
        # NOTE(review): re-seeds the *global* random module on every call so
        # results are reproducible, but this also affects other users of
        # `random` in the process.
        random.seed(0)
        suffidx = random.sample(range(len(predictions)), len(predictions))
        predictions = predictions[suffidx]
        actuals = actuals[suffidx]
        return predictions, actuals

    @staticmethod
    def _zero_one_normalize(predictions, epsilon=1e-7):
        """Normalize the predictions to the range between 0.0 and 1.0.

        For some predictions like SVM predictions, we need to normalize them before
        calculate the interpolated average precision. The normalization will not
        change the rank in the original list and thus won't change the average
        precision.

        Args:
          predictions: a numpy 1-D array storing the sparse prediction scores.
          epsilon: a small constant to avoid denominator being zero.

        Returns:
          The normalized prediction.
        """
        denominator = numpy.max(predictions) - numpy.min(predictions)
        # BUG FIX: this previously called numpy.max(denominator, epsilon),
        # which passes epsilon as the `axis` argument instead of taking the
        # larger of the two scalars. Use the builtin max.
        ret = (predictions - numpy.min(predictions)) / max(denominator, epsilon)
        return ret


# ---------------------------------------------------------------------------
# eval.py
# ---------------------------------------------------------------------------
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Binary for evaluating Tensorflow models on the YouTube-8M dataset."""
import os
import sys

# Explicitly add the file's directory to the path list.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
sys.path.append(os.path.join(os.getcwd(), "modules"))

# noinspection PyUnresolvedReferences
import pathmagic
import glob
import json
import os
import time
import sys
import eval_util
import losses
import video_level_models
import frame_level_models
import readers
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from tensorflow import app
from tensorflow import flags
from tensorflow import gfile
from tensorflow import logging
import utils


FLAGS = flags.FLAGS

if __name__ == "__main__":
    # Data set flags.
    flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
                        "The directory to load the model files from. "
                        "The tensorboard metrics files are also saved to this "
                        "directory.")
    flags.DEFINE_string(
        "eval_data_pattern", "",
        "File glob defining the evaluation dataset in tensorflow.SequenceExample "
        "format. The SequenceExamples are expected to have an 'rgb' byte array "
        "sequence feature as well as a 'labels' int64 context feature.")

    # Other flags.
    flags.DEFINE_integer("batch_size", 1024,
                         "How many examples to process per batch.")
    flags.DEFINE_integer("num_readers", 8,
                         "How many threads to use for reading input files.")
    flags.DEFINE_boolean("run_once", False, "Whether to run eval only once.")
    flags.DEFINE_integer("top_k", 20, "How many predictions to output per video.")


def find_class_by_name(name, modules):
    """Searches the provided modules for the named class and returns it.

    NOTE(review): raises StopIteration (not a friendly error) if no module
    defines `name`.
    """
    modules = [getattr(module, name, None) for module in modules]
    return next(a for a in modules if a)


def get_input_evaluation_tensors(reader,
                                 data_pattern,
                                 batch_size=1024,
                                 num_readers=1):
    """Creates the section of the graph which reads the evaluation data.

    Args:
      reader: A class which parses the training data.
      data_pattern: A 'glob' style path to the data files.
      batch_size: How many examples to process at a time.
      num_readers: How many I/O threads to use.

    Returns:
      A tuple containing the features tensor, labels tensor, and optionally a
      tensor containing the number of frames per video. The exact dimensions
      depend on the reader being used.

    Raises:
      IOError: If no files matching the given pattern were found.
    """
    logging.info("Using batch size of " + str(batch_size) + " for evaluation.")
    with tf.name_scope("eval_input"):
        files = gfile.Glob(data_pattern)
        if not files:
            raise IOError("Unable to find the evaluation files.")
        logging.info("number of evaluation files: " + str(len(files)))
        # A single epoch, unshuffled, so every example is evaluated exactly once.
        filename_queue = tf.train.string_input_producer(
            files, shuffle=False, num_epochs=1)
        eval_data = [
            reader.prepare_reader(filename_queue) for _ in range(num_readers)
        ]
        return tf.train.batch_join(
            eval_data,
            batch_size=batch_size,
            capacity=3 * batch_size,
            allow_smaller_final_batch=True,
            enqueue_many=True)


def build_graph(reader,
                model,
                eval_data_pattern,
                label_loss_fn,
                batch_size=1024,
                num_readers=1):
    """Creates the Tensorflow graph for evaluation.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
             from BaseModel.
      eval_data_pattern: glob path to the evaluation data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
                  from BaseLoss.
      batch_size: How many examples to process at a time.
      num_readers: How many threads to use for I/O operations.
    """

    global_step = tf.Variable(0, trainable=False, name="global_step")
    video_id_batch, model_input_raw, labels_batch, num_frames = get_input_evaluation_tensors(  # pylint: disable=g-line-too-long
        reader,
        eval_data_pattern,
        batch_size=batch_size,
        num_readers=num_readers)
    tf.summary.histogram("model_input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1

    # Normalize input features.
    model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    with tf.variable_scope("tower"):
        result = model.create_model(model_input,
                                    num_frames=num_frames,
                                    vocab_size=reader.num_classes,
                                    labels=labels_batch,
                                    is_training=False)
        predictions = result["predictions"]
        tf.summary.histogram("model_activations", predictions)
        # Models may compute their own loss; fall back to the provided loss fn.
        if "loss" in result.keys():
            label_loss = result["loss"]
        else:
            label_loss = label_loss_fn.calculate_loss(predictions, labels_batch)

    # Expose the eval tensors through collections so evaluate() can retrieve
    # them without holding direct references.
    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", predictions)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("video_id_batch", video_id_batch)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("summary_op", tf.summary.merge_all())


def get_latest_checkpoint():
    """Returns the checkpoint prefix with the largest global step in
    FLAGS.train_dir, or None if no checkpoint index files exist."""
    index_files = file_io.get_matching_files(os.path.join(FLAGS.train_dir, 'model.ckpt-*.index'))
    tf.logging.debug("Looking at {}".format(index_files))

    # No files
    if not index_files:
        return None

    # Index file path with the maximum step size.
    latest_index_file = sorted(
        [(int(os.path.basename(f).split("-")[-1].split(".")[0]), f)
         for f in index_files])[-1][1]

    # Chop off the ".index" suffix (6 characters) and return.
    return latest_index_file[:-6]


def evaluation_loop(video_id_batch, prediction_batch, label_batch, loss,
                    summary_op, saver, summary_writer, evl_metrics,
                    last_global_step_val):
    """Run the evaluation loop once.

    Args:
      video_id_batch: a tensor of video ids mini-batch.
      prediction_batch: a tensor of predictions mini-batch.
      label_batch: a tensor of label_batch mini-batch.
      loss: a tensor of loss for the examples in the mini-batch.
      summary_op: a tensor which runs the tensorboard summary operations.
      saver: a tensorflow saver to restore the model.
      summary_writer: a tensorflow summary_writer
      evl_metrics: an EvaluationMetrics object.
      last_global_step_val: the global step used in the previous evaluation.

    Returns:
      The global_step used in the latest model.
    """

    global_step_val = -1
    with tf.Session() as sess:
        latest_checkpoint = get_latest_checkpoint()
        print(latest_checkpoint)
        if latest_checkpoint:
            logging.info("Loading checkpoint for eval: " + latest_checkpoint)
            # Restores from checkpoint
            saver.restore(sess, latest_checkpoint)
            # Assuming model_checkpoint_path looks something like:
            # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it.
            global_step_val = os.path.basename(latest_checkpoint).split("-")[-1]

            # Save model
            saver.save(sess, os.path.join(FLAGS.train_dir, "inference_model"))
        else:
            logging.info("No checkpoint file found.")
            return global_step_val

        if global_step_val == last_global_step_val:
            logging.info("skip this checkpoint global_step_val=%s "
                         "(same as the previous one).", global_step_val)
            return global_step_val

        sess.run([tf.local_variables_initializer()])

        # Start the queue runners.
        fetches = [video_id_batch, prediction_batch, label_batch, loss, summary_op]
        coord = tf.train.Coordinator()
        # BUG FIX: summary_val must exist before the loop; otherwise the
        # OutOfRangeError handler hits an UnboundLocalError when the
        # evaluation set yields zero batches.
        summary_val = None
        try:
            threads = []
            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                threads.extend(qr.create_threads(
                    sess, coord=coord, daemon=True,
                    start=True))
            logging.info("enter eval_once loop global_step_val = %s. ",
                         global_step_val)

            evl_metrics.clear()

            examples_processed = 0
            while not coord.should_stop():
                batch_start_time = time.time()
                _, predictions_val, labels_val, loss_val, summary_val = sess.run(
                    fetches)
                seconds_per_batch = time.time() - batch_start_time
                example_per_second = labels_val.shape[0] / seconds_per_batch
                examples_processed += labels_val.shape[0]

                iteration_info_dict = evl_metrics.accumulate(predictions_val,
                                                             labels_val, loss_val)
                iteration_info_dict["examples_per_second"] = example_per_second

                iterinfo = utils.AddGlobalStepSummary(
                    summary_writer,
                    global_step_val,
                    iteration_info_dict,
                    summary_scope="Eval")
                logging.info("examples_processed: %d | %s", examples_processed,
                             iterinfo)

        except tf.errors.OutOfRangeError as e:
            # Raised by the input pipeline when the single eval epoch is done.
            logging.info(
                "Done with batched inference. Now calculating global performance "
                "metrics.")
            # calculate the metrics for the entire epoch
            epoch_info_dict = evl_metrics.get()
            epoch_info_dict["epoch_id"] = global_step_val

            if summary_val is not None:
                summary_writer.add_summary(summary_val, global_step_val)
            epochinfo = utils.AddEpochSummary(
                summary_writer,
                global_step_val,
                epoch_info_dict,
                summary_scope="Eval")
            logging.info(epochinfo)
            evl_metrics.clear()
        except Exception as e:  # pylint: disable=broad-except
            logging.info("Unexpected exception: " + str(e))
            coord.request_stop(e)

        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=10)

        return global_step_val


def evaluate():
    """Builds the eval graph from the saved training flags and loops,
    evaluating each new checkpoint found in FLAGS.train_dir."""
    tf.set_random_seed(0)  # for reproducibility

    # Write json of flags
    model_flags_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if not file_io.file_exists(model_flags_path):
        raise IOError(("Cannot find file %s. Did you run train.py on the same "
                       "--train_dir?") % model_flags_path)
    flags_dict = json.loads(file_io.FileIO(model_flags_path, mode="r").read())

    with tf.Graph().as_default():
        # convert feature_names and feature_sizes to lists of values
        feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
            flags_dict["feature_names"], flags_dict["feature_sizes"])

        if flags_dict["frame_features"]:
            reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
                                                    feature_sizes=feature_sizes)
        else:
            reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
                                                         feature_sizes=feature_sizes)

        model = find_class_by_name(flags_dict["model"],
                                   [frame_level_models, video_level_models])()
        label_loss_fn = find_class_by_name(flags_dict["label_loss"], [losses])()

        # BUG FIX: this previously used `FLAGS.eval_data_pattern is ""`,
        # an identity comparison with a string literal whose result is
        # implementation-defined; test for emptiness instead.
        if not FLAGS.eval_data_pattern:
            raise IOError("'eval_data_pattern' was not specified. " +
                          "Nothing to evaluate.")

        build_graph(
            reader=reader,
            model=model,
            eval_data_pattern=FLAGS.eval_data_pattern,
            label_loss_fn=label_loss_fn,
            num_readers=FLAGS.num_readers,
            batch_size=FLAGS.batch_size)
        logging.info("built evaluation graph")
        video_id_batch = tf.get_collection("video_id_batch")[0]
        prediction_batch = tf.get_collection("predictions")[0]
        label_batch = tf.get_collection("labels")[0]
        loss = tf.get_collection("loss")[0]
        summary_op = tf.get_collection("summary_op")[0]

        saver = tf.train.Saver(tf.global_variables())
        summary_writer = tf.summary.FileWriter(
            FLAGS.train_dir, graph=tf.get_default_graph())

        evl_metrics = eval_util.EvaluationMetrics(reader.num_classes, FLAGS.top_k)

        last_global_step_val = -1
        while True:
            last_global_step_val = evaluation_loop(video_id_batch, prediction_batch,
                                                   label_batch, loss, summary_op,
                                                   saver, summary_writer, evl_metrics,
                                                   last_global_step_val)
            if FLAGS.run_once:
                break


def main(unused_argv):
    logging.set_verbosity(tf.logging.INFO)
    print("tensorflow version: %s" % tf.__version__)
    evaluate()


if __name__ == "__main__":
    app.run()


# ---------------------------------------------------------------------------
# eval_util.py
# ---------------------------------------------------------------------------
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides functions to help with evaluating models."""
import mean_average_precision_calculator as map_calculator
import average_precision_calculator as ap_calculator
import numpy


def flatten(l):
    """ Merges a list of lists into a single list. """
    return [item for sublist in l for item in sublist]


def calculate_hit_at_one(predictions, actuals):
    """Performs a local (numpy) calculation of the hit at one.

    Args:
      predictions: Matrix containing the outputs of the model.
        Dimensions are 'batch' x 'num_classes'.
      actuals: Matrix containing the ground truth labels.
        Dimensions are 'batch' x 'num_classes'.

    Returns:
      float: The average hit at one across the entire batch.
    """
    top_prediction = numpy.argmax(predictions, 1)
    # For each row, read off the actual label at the top-predicted class.
    hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
    return numpy.average(hits)


def calculate_precision_at_equal_recall_rate(predictions, actuals):
    """Performs a local (numpy) calculation of the PERR.

    Args:
      predictions: Matrix containing the outputs of the model.
        Dimensions are 'batch' x 'num_classes'.
      actuals: Matrix containing the ground truth labels.
        Dimensions are 'batch' x 'num_classes'.

    Returns:
      float: The average precision at equal recall rate across the entire batch.
    """
    aggregated_precision = 0.0
    num_videos = actuals.shape[0]
    for row in numpy.arange(num_videos):
        num_labels = int(numpy.sum(actuals[row]))
        # NOTE(review): when a video has zero labels, `[-0:]` selects every
        # class, so such rows contribute 0 precision over all classes --
        # confirm zero-label videos cannot occur upstream.
        top_indices = numpy.argpartition(predictions[row],
                                         -num_labels)[-num_labels:]
        item_precision = 0.0
        for label_index in top_indices:
            if predictions[row][label_index] > 0:
                item_precision += actuals[row][label_index]
        item_precision /= top_indices.size
        aggregated_precision += item_precision
    aggregated_precision /= num_videos
    return aggregated_precision


def calculate_gap(predictions, actuals, top_k=20):
    """Performs a local (numpy) calculation of the global average precision.

    Only the top_k predictions are taken for each of the videos.

    Args:
      predictions: Matrix containing the outputs of the model.
        Dimensions are 'batch' x 'num_classes'.
      actuals: Matrix containing the ground truth labels.
        Dimensions are 'batch' x 'num_classes'.
      top_k: How many predictions to use per video.

    Returns:
      float: The global average precision.
    """
    gap_calculator = ap_calculator.AveragePrecisionCalculator()
    sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k)
    gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
    return gap_calculator.peek_ap_at_n()
111 | """ 112 | if k <= 0: 113 | raise ValueError("k must be a positive integer.") 114 | k = min(k, predictions.shape[1]) 115 | num_classes = predictions.shape[1] 116 | prediction_triplets= [] 117 | for video_index in range(predictions.shape[0]): 118 | prediction_triplets.extend(top_k_triplets(predictions[video_index],labels[video_index], k)) 119 | out_predictions = [[] for v in range(num_classes)] 120 | out_labels = [[] for v in range(num_classes)] 121 | for triplet in prediction_triplets: 122 | out_predictions[triplet[0]].append(triplet[1]) 123 | out_labels[triplet[0]].append(triplet[2]) 124 | out_true_positives = [numpy.sum(labels[:,i]) for i in range(num_classes)] 125 | 126 | return out_predictions, out_labels, out_true_positives 127 | 128 | 129 | def top_k_triplets(predictions, labels, k=20): 130 | """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in 131 | (prediction, class) format""" 132 | m = len(predictions) 133 | k = min(k, m) 134 | indices = numpy.argpartition(predictions, -k)[-k:] 135 | return [(index, predictions[index], labels[index]) for index in indices] 136 | 137 | 138 | class EvaluationMetrics(object): 139 | """A class to store the evaluation metrics.""" 140 | def __init__(self, num_class, top_k): 141 | """Construct an EvaluationMetrics object to store the evaluation metrics. 142 | 143 | Args: 144 | num_class: A positive integer specifying the number of classes. 145 | top_k: A positive integer specifying how many predictions are considered per video. 146 | 147 | Raises: 148 | ValueError: An error occurred when MeanAveragePrecisionCalculator cannot 149 | not be constructed. 
150 | """ 151 | self.sum_hit_at_one = 0.0 152 | self.sum_perr = 0.0 153 | self.sum_loss = 0.0 154 | self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(num_class) 155 | self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator() 156 | self.top_k = top_k 157 | self.num_examples = 0 158 | 159 | def accumulate(self, predictions, labels, loss): 160 | """Accumulate the metrics calculated locally for this mini-batch. 161 | 162 | Args: 163 | predictions: A numpy matrix containing the outputs of the model. 164 | Dimensions are 'batch' x 'num_classes'. 165 | labels: A numpy matrix containing the ground truth labels. 166 | Dimensions are 'batch' x 'num_classes'. 167 | loss: A numpy array containing the loss for each sample. 168 | 169 | Returns: 170 | dictionary: A dictionary storing the metrics for the mini-batch. 171 | 172 | Raises: 173 | ValueError: An error occurred when the shape of predictions and actuals 174 | does not match. 175 | """ 176 | batch_size = labels.shape[0] 177 | mean_hit_at_one = calculate_hit_at_one(predictions, labels) 178 | mean_perr = calculate_precision_at_equal_recall_rate(predictions, labels) 179 | mean_loss = numpy.mean(loss) 180 | 181 | # Take the top 20 predictions. 182 | sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, labels, self.top_k) 183 | self.map_calculator.accumulate(sparse_predictions, sparse_labels, num_positives) 184 | self.global_ap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives)) 185 | 186 | self.num_examples += batch_size 187 | self.sum_hit_at_one += mean_hit_at_one * batch_size 188 | self.sum_perr += mean_perr * batch_size 189 | self.sum_loss += mean_loss * batch_size 190 | 191 | return {"hit_at_one": mean_hit_at_one, "perr": mean_perr, "loss": mean_loss} 192 | 193 | def get(self): 194 | """Calculate the evaluation metrics for the whole epoch. 195 | 196 | Raises: 197 | ValueError: If no examples were accumulated. 
198 | 199 | Returns: 200 | dictionary: a dictionary storing the evaluation metrics for the epoch. The 201 | dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and 202 | aps (default nan). 203 | """ 204 | if self.num_examples <= 0: 205 | raise ValueError("total_sample must be positive.") 206 | avg_hit_at_one = self.sum_hit_at_one / self.num_examples 207 | avg_perr = self.sum_perr / self.num_examples 208 | avg_loss = self.sum_loss / self.num_examples 209 | 210 | aps = self.map_calculator.peek_map_at_n() 211 | gap = self.global_ap_calculator.peek_ap_at_n() 212 | 213 | epoch_info_dict = {} 214 | return {"avg_hit_at_one": avg_hit_at_one, "avg_perr": avg_perr, 215 | "avg_loss": avg_loss, "aps": aps, "gap": gap} 216 | 217 | def clear(self): 218 | """Clear the evaluation metrics and reset the EvaluationMetrics object.""" 219 | self.sum_hit_at_one = 0.0 220 | self.sum_perr = 0.0 221 | self.sum_loss = 0.0 222 | self.map_calculator.clear() 223 | self.global_ap_calculator.clear() 224 | self.num_examples = 0 225 | -------------------------------------------------------------------------------- /export_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities to export a model for batch prediction.""" 15 | # noinspection PyUnresolvedReferences 16 | import pathmagic 17 | import tensorflow as tf 18 | import tensorflow.contrib.slim as slim 19 | 20 | from tensorflow.python.saved_model import builder as saved_model_builder 21 | from tensorflow.python.saved_model import signature_constants 22 | from tensorflow.python.saved_model import signature_def_utils 23 | from tensorflow.python.saved_model import tag_constants 24 | from tensorflow.python.saved_model import utils as saved_model_utils 25 | 26 | _TOP_PREDICTIONS_IN_OUTPUT = 20 27 | 28 | 29 | class ModelExporter(object): 30 | def __init__(self, frame_features, model, reader): 31 | self.frame_features = frame_features 32 | self.model = model 33 | self.reader = reader 34 | 35 | with tf.Graph().as_default() as graph: 36 | self.inputs, self.outputs = self.build_inputs_and_outputs() 37 | self.graph = graph 38 | self.saver = tf.train.Saver(tf.trainable_variables(), sharded=True) 39 | 40 | def export_model(self, model_dir, global_step_val, last_checkpoint): 41 | """Exports the model so that it can used for batch predictions.""" 42 | with self.graph.as_default(): 43 | with tf.Session() as session: 44 | session.run(tf.global_variables_initializer()) 45 | self.saver.restore(session, last_checkpoint) 46 | 47 | signature = signature_def_utils.build_signature_def( 48 | inputs=self.inputs, 49 | outputs=self.outputs, 50 | method_name=signature_constants.PREDICT_METHOD_NAME) 51 | 52 | signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 53 | signature} 54 | 55 | model_builder = saved_model_builder.SavedModelBuilder(model_dir) 56 | model_builder.add_meta_graph_and_variables(session, 57 | tags=[tag_constants.SERVING], 58 | signature_def_map=signature_map, 59 | clear_devices=True) 60 | model_builder.save() 61 | 62 | def build_inputs_and_outputs(self): 63 | if self.frame_features: 64 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 65 | 66 | 
fn = lambda x: self.build_prediction_graph(x) 67 | video_id_output, top_indices_output, top_predictions_output = ( 68 | tf.map_fn(fn, serialized_examples, 69 | dtype=(tf.string, tf.int32, tf.float32))) 70 | 71 | else: 72 | serialized_examples = tf.placeholder(tf.string, shape=(None,)) 73 | 74 | video_id_output, top_indices_output, top_predictions_output = ( 75 | self.build_prediction_graph(serialized_examples)) 76 | 77 | inputs = {"example_bytes": 78 | saved_model_utils.build_tensor_info(serialized_examples)} 79 | 80 | outputs = { 81 | "video_id": saved_model_utils.build_tensor_info(video_id_output), 82 | "class_indexes": saved_model_utils.build_tensor_info(top_indices_output), 83 | "predictions": saved_model_utils.build_tensor_info(top_predictions_output)} 84 | 85 | return inputs, outputs 86 | 87 | def build_prediction_graph(self, serialized_examples): 88 | video_id, model_input_raw, labels_batch, num_frames = ( 89 | self.reader.prepare_serialized_examples(serialized_examples)) 90 | 91 | feature_dim = len(model_input_raw.get_shape()) - 1 92 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 93 | 94 | with tf.variable_scope("tower"): 95 | result = self.model.create_model( 96 | model_input, 97 | num_frames=num_frames, 98 | vocab_size=self.reader.num_classes, 99 | labels=labels_batch, 100 | is_training=False) 101 | 102 | for variable in slim.get_model_variables(): 103 | tf.summary.histogram(variable.op.name, variable) 104 | 105 | predictions = result["predictions"] 106 | 107 | top_predictions, top_indices = tf.nn.top_k(predictions, 108 | _TOP_PREDICTIONS_IN_OUTPUT) 109 | return video_id, top_indices, top_predictions 110 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Binary for generating predictions over a set of videos.""" 16 | import os 17 | import sys 18 | 19 | # Explicitly add the file's directory to the path list. 20 | file_dir = os.path.dirname(__file__) 21 | sys.path.append(file_dir) 22 | sys.path.append(os.path.join(os.getcwd(), "modules")) 23 | # noinspection PyUnresolvedReferences 24 | import pathmagic 25 | import os 26 | import glob 27 | import json 28 | import tarfile 29 | import time 30 | import sys 31 | import numpy 32 | import tensorflow as tf 33 | from tensorflow.python.lib.io import file_io 34 | from tensorflow import app 35 | from tensorflow import flags 36 | from tensorflow import gfile 37 | from tensorflow import logging 38 | 39 | import eval_util 40 | import losses 41 | import readers 42 | import utils 43 | 44 | 45 | FLAGS = flags.FLAGS 46 | 47 | if __name__ == '__main__': 48 | # Input 49 | flags.DEFINE_string("train_dir", "", 50 | "The directory to load the model files from. We assume " 51 | "that you have already run eval.py onto this, such that " 52 | "inference_model.* files already exist.") 53 | flags.DEFINE_string( 54 | "input_data_pattern", "", 55 | "File glob defining the evaluation dataset in tensorflow.SequenceExample " 56 | "format. 
The SequenceExamples are expected to have an 'rgb' byte array " 57 | "sequence feature as well as a 'labels' int64 context feature.") 58 | flags.DEFINE_string("input_model_tgz", "", 59 | "If given, must be path to a .tgz file that was written " 60 | "by this binary using flag --output_model_tgz. In this " 61 | "case, the .tgz file will be untarred to " 62 | "--untar_model_dir and the model will be used for " 63 | "inference.") 64 | flags.DEFINE_string("untar_model_dir", "/tmp/yt8m-model", 65 | "If --input_model_tgz is given, then this directory will " 66 | "be created and the contents of the .tgz file will be " 67 | "untarred here.") 68 | 69 | # Output 70 | flags.DEFINE_string("output_file", "", 71 | "The file to save the predictions to.") 72 | flags.DEFINE_string("output_model_tgz", "", 73 | "If given, should be a filename with a .tgz extension, " 74 | "the model graph and checkpoint will be bundled in this " 75 | "gzip tar. This file can be uploaded to Kaggle for the " 76 | "top 10 participants.") 77 | flags.DEFINE_integer("top_k", 20, 78 | "How many predictions to output per video.") 79 | 80 | # Other flags. 81 | flags.DEFINE_integer( 82 | "batch_size", 1024, 83 | "How many examples to process per batch.") 84 | flags.DEFINE_integer("num_readers", 1, 85 | "How many threads to use for reading input files.") 86 | 87 | 88 | def format_lines(video_ids, predictions, top_k): 89 | batch_size = len(video_ids) 90 | for video_index in range(batch_size): 91 | top_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:] 92 | line = [(class_index, predictions[video_index][class_index]) 93 | for class_index in top_indices] 94 | line = sorted(line, key=lambda p: -p[1]) 95 | yield video_ids[video_index].decode('utf-8') + "," + " ".join( 96 | "%i %g" % (label, score) for (label, score) in line) + "\n" 97 | 98 | 99 | def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1): 100 | """Creates the section of the graph which reads the input data. 
101 | 102 | Args: 103 | reader: A class which parses the input data. 104 | data_pattern: A 'glob' style path to the data files. 105 | batch_size: How many examples to process at a time. 106 | num_readers: How many I/O threads to use. 107 | 108 | Returns: 109 | A tuple containing the features tensor, labels tensor, and optionally a 110 | tensor containing the number of frames per video. The exact dimensions 111 | depend on the reader being used. 112 | 113 | Raises: 114 | IOError: If no files matching the given pattern were found. 115 | """ 116 | with tf.name_scope("input"): 117 | files = gfile.Glob(data_pattern) 118 | if not files: 119 | raise IOError("Unable to find input files. data_pattern='" + 120 | data_pattern + "'") 121 | logging.info("number of input files: " + str(len(files))) 122 | filename_queue = tf.train.string_input_producer( 123 | files, num_epochs=1, shuffle=False) 124 | examples_and_labels = [reader.prepare_reader(filename_queue) 125 | for _ in range(num_readers)] 126 | 127 | video_id_batch, video_batch, unused_labels, num_frames_batch = ( 128 | tf.train.batch_join(examples_and_labels, 129 | batch_size=batch_size, 130 | allow_smaller_final_batch=True, 131 | enqueue_many=True)) 132 | return video_id_batch, video_batch, num_frames_batch 133 | 134 | 135 | def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k): 136 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess, gfile.Open(out_file_location, 137 | "w+") as out_file: 138 | video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size) 139 | checkpoint_file = os.path.join(FLAGS.train_dir, "inference_model") 140 | if not gfile.Exists(checkpoint_file + ".meta"): 141 | raise IOError("Cannot find %s. Did you run eval.py?" 
% checkpoint_file) 142 | meta_graph_location = checkpoint_file + ".meta" 143 | logging.info("loading meta-graph: " + meta_graph_location) 144 | 145 | if FLAGS.output_model_tgz: 146 | out_file_tgz = file_io.FileIO(FLAGS.output_model_tgz, "w") 147 | with tarfile.open(fileobj=out_file_tgz, mode="w:gz") as tar: 148 | for model_file in file_io.get_matching_files(checkpoint_file + '.*'): 149 | # tar.addfile(file_io.FileIO(model_file, "r"), arcname=os.path.basename(model_file)) 150 | tar.addfile(file_io.FileIO(model_file, "r")) 151 | # tar.add(os.path.join(FLAGS.train_dir, "model_flags.json"), 152 | # arcname="model_flags.json") 153 | tar.addfile(file_io.FileIO(os.path.join(FLAGS.train_dir, "model_flags.json"), "r")) 154 | print('Tarred model onto ' + FLAGS.output_model_tgz) 155 | with tf.device("/gpu:0"): 156 | saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True) 157 | logging.info("restoring variables from " + checkpoint_file) 158 | saver.restore(sess, checkpoint_file) 159 | input_tensor = tf.get_collection("input_batch_raw")[0] 160 | num_frames_tensor = tf.get_collection("num_frames")[0] 161 | predictions_tensor = tf.get_collection("predictions")[0] 162 | 163 | # Workaround for num_epochs issue. 
164 | def set_up_init_ops(variables): 165 | init_op_list = [] 166 | for variable in list(variables): 167 | if "train_input" in variable.name: 168 | init_op_list.append(tf.assign(variable, 1)) 169 | variables.remove(variable) 170 | init_op_list.append(tf.variables_initializer(variables)) 171 | return init_op_list 172 | 173 | # tf.get_default_graph().clear_collection("queue_runners") 174 | # tf.get_default_graph().clear_collection("local_variables") 175 | sess.run(set_up_init_ops(tf.get_collection_ref( 176 | tf.GraphKeys.LOCAL_VARIABLES))) 177 | 178 | coord = tf.train.Coordinator() 179 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 180 | num_examples_processed = 0 181 | start_time = time.time() 182 | out_file.write("VideoId,LabelConfidencePairs\n") 183 | 184 | try: 185 | while not coord.should_stop(): 186 | video_id_batch_val, video_batch_val, num_frames_batch_val = sess.run( 187 | [video_id_batch, video_batch, num_frames_batch]) 188 | predictions_val, = sess.run([predictions_tensor], feed_dict={input_tensor: video_batch_val, 189 | num_frames_tensor: num_frames_batch_val}) 190 | now = time.time() 191 | num_examples_processed += len(video_batch_val) 192 | num_classes = predictions_val.shape[1] 193 | logging.info( 194 | "num examples processed: " + str(num_examples_processed) + " elapsed seconds: " + "{0:.2f}".format( 195 | now - start_time)) 196 | for line in format_lines(video_id_batch_val, predictions_val, top_k): 197 | out_file.write(line) 198 | out_file.flush() 199 | 200 | except tf.errors.OutOfRangeError: 201 | logging.info('Done with inference. The output file was written to ' + out_file_location) 202 | finally: 203 | coord.request_stop() 204 | 205 | coord.join(threads) 206 | sess.close() 207 | 208 | 209 | def main(unused_argv): 210 | logging.set_verbosity(tf.logging.INFO) 211 | if FLAGS.input_model_tgz: 212 | if FLAGS.train_dir: 213 | raise ValueError("You cannot supply --train_dir if supplying " 214 | "--input_model_tgz") 215 | # Untar. 
216 | if not file_io.file_exists(FLAGS.untar_model_dir): 217 | os.makedirs(FLAGS.untar_model_dir) 218 | tarfile.open(FLAGS.input_model_tgz).extractall(FLAGS.untar_model_dir) 219 | FLAGS.train_dir = FLAGS.untar_model_dir 220 | 221 | flags_dict_file = os.path.join(FLAGS.train_dir, "model_flags.json") 222 | if not file_io.file_exists(flags_dict_file): 223 | raise IOError("Cannot find %s. Did you run eval.py?" % flags_dict_file) 224 | flags_dict = json.loads(file_io.FileIO(flags_dict_file, "r").read()) 225 | 226 | # convert feature_names and feature_sizes to lists of values 227 | feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes( 228 | flags_dict["feature_names"], flags_dict["feature_sizes"]) 229 | 230 | if flags_dict["frame_features"]: 231 | reader = readers.YT8MFrameFeatureReader(feature_names=feature_names, 232 | feature_sizes=feature_sizes) 233 | else: 234 | reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names, 235 | feature_sizes=feature_sizes) 236 | 237 | if FLAGS.output_file is "": 238 | raise ValueError("'output_file' was not specified. " 239 | "Unable to continue with inference.") 240 | 241 | if FLAGS.input_data_pattern is "": 242 | raise ValueError("'input_data_pattern' was not specified. " 243 | "Unable to continue with inference.") 244 | 245 | inference(reader, FLAGS.train_dir, FLAGS.input_data_pattern, 246 | FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k) 247 | 248 | 249 | if __name__ == "__main__": 250 | app.run() 251 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Provides definitions for non-regularized training or test losses.""" 16 | # noinspection PyUnresolvedReferences 17 | import pathmagic 18 | import tensorflow as tf 19 | 20 | 21 | class BaseLoss(object): 22 | """Inherit from this class when implementing new losses.""" 23 | 24 | def calculate_loss(self, unused_predictions, unused_labels, **unused_params): 25 | """Calculates the average loss of the examples in a mini-batch. 26 | 27 | Args: 28 | unused_predictions: a 2-d tensor storing the prediction scores, in which 29 | each row represents a sample in the mini-batch and each column 30 | represents a class. 31 | unused_labels: a 2-d tensor storing the labels, which has the same shape 32 | as the unused_predictions. The labels must be in the range of 0 and 1. 33 | unused_params: loss specific parameters. 34 | 35 | Returns: 36 | A scalar loss tensor. 37 | """ 38 | raise NotImplementedError() 39 | 40 | 41 | class CrossEntropyLoss(BaseLoss): 42 | """Calculate the cross entropy loss between the predictions and labels. 
43 | """ 44 | def calculate_loss(self, predictions, labels, **unused_params): 45 | with tf.name_scope("loss_xent"): 46 | epsilon = 10e-6 47 | float_labels = tf.cast(labels, tf.float32) 48 | cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + \ 49 | (1 - float_labels) * tf.log(1 - predictions + epsilon) 50 | cross_entropy_loss = tf.negative(cross_entropy_loss) 51 | return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1)) 52 | 53 | 54 | class HingeLoss(BaseLoss): 55 | """Calculate the hinge loss between the predictions and labels. 56 | 57 | Note the subgradient is used in the backpropagation, and thus the optimization 58 | may converge slower. The predictions trained by the hinge loss are between -1 59 | and +1. 60 | """ 61 | def calculate_loss(self, predictions, labels, b=1.0, **unused_params): 62 | with tf.name_scope("loss_hinge"): 63 | float_labels = tf.cast(labels, tf.float32) 64 | all_zeros = tf.zeros(tf.shape(float_labels), dtype=tf.float32) 65 | all_ones = tf.ones(tf.shape(float_labels), dtype=tf.float32) 66 | sign_labels = tf.subtract(tf.scalar_mul(2, float_labels), all_ones) 67 | hinge_loss = tf.maximum( 68 | all_zeros, tf.scalar_mul(b, all_ones) - sign_labels * predictions) 69 | return tf.reduce_mean(tf.reduce_sum(hinge_loss, 1)) 70 | 71 | 72 | class SoftmaxLoss(BaseLoss): 73 | """Calculate the softmax loss between the predictions and labels. 74 | 75 | The function calculates the loss in the following way: first we feed the 76 | predictions to the softmax activation function and then we calculate 77 | the minus linear dot product between the logged softmax activations and the 78 | normalized ground truth label. 79 | 80 | It is an extension to the one-hot label. It allows for more than one positive 81 | labels for each sample. 
82 | """ 83 | 84 | def calculate_loss(self, predictions, labels, **unused_params): 85 | with tf.name_scope("loss_softmax"): 86 | epsilon = 10e-8 87 | float_labels = tf.cast(labels, tf.float32) 88 | # l1 normalization (labels are no less than 0) 89 | label_rowsum = tf.maximum( 90 | tf.reduce_sum(float_labels, 1, keep_dims=True), 91 | epsilon) 92 | norm_float_labels = tf.div(float_labels, label_rowsum) 93 | softmax_outputs = tf.nn.softmax(predictions) 94 | softmax_loss = tf.negative(tf.reduce_sum( 95 | tf.multiply(norm_float_labels, tf.log(softmax_outputs)), 1)) 96 | return tf.reduce_mean(softmax_loss) 97 | -------------------------------------------------------------------------------- /mean_average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Calculate the mean average precision. 16 | 17 | It provides an interface for calculating mean average precision 18 | for an entire list or the top-n ranked items. 19 | 20 | Example usages: 21 | We first call the function accumulate many times to process parts of the ranked 22 | list. After processing all the parts, we call peek_map_at_n 23 | to calculate the mean average precision. 
24 | 25 | ``` 26 | import random 27 | 28 | p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)]) 29 | a = np.array([[random.choice([0, 1]) for _ in xrange(50)] 30 | for _ in xrange(1000)]) 31 | 32 | # mean average precision for 50 classes. 33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( 34 | num_class=50) 35 | calculator.accumulate(p, a) 36 | aps = calculator.peek_map_at_n() 37 | ``` 38 | """ 39 | # noinspection PyUnresolvedReferences 40 | import pathmagic 41 | import numpy 42 | import average_precision_calculator 43 | 44 | 45 | class MeanAveragePrecisionCalculator(object): 46 | """This class is to calculate mean average precision. 47 | """ 48 | 49 | def __init__(self, num_class): 50 | """Construct a calculator to calculate the (macro) average precision. 51 | 52 | Args: 53 | num_class: A positive Integer specifying the number of classes. 54 | top_n_array: A list of positive integers specifying the top n for each 55 | class. The top n in each class will be used to calculate its average 56 | precision at n. 57 | The size of the array must be num_class. 58 | 59 | Raises: 60 | ValueError: An error occurred when num_class is not a positive integer; 61 | or the top_n_array is not a list of positive integers. 62 | """ 63 | if not isinstance(num_class, int) or num_class <= 1: 64 | raise ValueError("num_class must be a positive integer.") 65 | 66 | self._ap_calculators = [] # member of AveragePrecisionCalculator 67 | self._num_class = num_class # total number of classes 68 | for i in range(num_class): 69 | self._ap_calculators.append( 70 | average_precision_calculator.AveragePrecisionCalculator()) 71 | 72 | def accumulate(self, predictions, actuals, num_positives=None): 73 | """Accumulate the predictions and their ground truth labels. 74 | 75 | Args: 76 | predictions: A list of lists storing the prediction scores. The outer 77 | dimension corresponds to classes. 
78 | actuals: A list of lists storing the ground truth labels. The dimensions 79 | should correspond to the predictions input. Any value 80 | larger than 0 will be treated as positives, otherwise as negatives. 81 | num_positives: If provided, it is a list of numbers representing the 82 | number of true positives for each class. If not provided, the number of 83 | true positives will be inferred from the 'actuals' array. 84 | 85 | Raises: 86 | ValueError: An error occurred when the shape of predictions and actuals 87 | does not match. 88 | """ 89 | if not num_positives: 90 | num_positives = [None for i in predictions.shape[1]] 91 | 92 | calculators = self._ap_calculators 93 | for i in range(len(predictions)): 94 | calculators[i].accumulate(predictions[i], actuals[i], num_positives[i]) 95 | 96 | def clear(self): 97 | for calculator in self._ap_calculators: 98 | calculator.clear() 99 | 100 | def is_empty(self): 101 | return ([calculator.heap_size for calculator in self._ap_calculators] == 102 | [0 for _ in range(self._num_class)]) 103 | 104 | def peek_map_at_n(self): 105 | """Peek the non-interpolated mean average precision at n. 106 | 107 | Returns: 108 | An array of non-interpolated average precision at n (default 0) for each 109 | class. 110 | """ 111 | aps = [self._ap_calculators[i].peek_ap_at_n() 112 | for i in range(self._num_class)] 113 | return aps 114 | -------------------------------------------------------------------------------- /model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains a collection of util functions for model construction. 16 | """ 17 | # noinspection PyUnresolvedReferences 18 | import pathmagic 19 | import numpy 20 | import tensorflow as tf 21 | from tensorflow import logging 22 | from tensorflow import flags 23 | import tensorflow.contrib.slim as slim 24 | 25 | 26 | def SampleRandomSequence(model_input, num_frames, num_samples): 27 | """ Samples a random sequence of frames of size num_samples. 28 | 29 | Args: 30 | model_input: A tensor of size batch_size x max_frames x feature_size 31 | num_frames: A tensor of size batch_size x 1 32 | num_samples: A scalar 33 | 34 | Returns: 35 | `model_input`: A tensor of size batch_size x num_samples x feature_size 36 | """ 37 | 38 | batch_size = tf.shape(model_input)[0] 39 | frame_index_offset = tf.tile( 40 | tf.expand_dims(tf.range(num_samples), 0), [batch_size, 1]) 41 | max_start_frame_index = tf.maximum(num_frames - num_samples, 0) 42 | start_frame_index = tf.cast( 43 | tf.multiply( 44 | tf.random_uniform([batch_size, 1]), 45 | tf.cast(max_start_frame_index + 1, tf.float32)), tf.int32) 46 | frame_index = tf.minimum(start_frame_index + frame_index_offset, 47 | tf.cast(num_frames - 1, tf.int32)) 48 | batch_index = tf.tile( 49 | tf.expand_dims(tf.range(batch_size), 1), [1, num_samples]) 50 | index = tf.stack([batch_index, frame_index], 2) 51 | return tf.gather_nd(model_input, index) 52 | 53 | 54 | def SampleRandomFrames(model_input, num_frames, num_samples): 55 | """ Samples a random set of frames of size num_samples. 
def FramePooling(frames, method, **unused_params):
  """Pools over the frames of a video.

  Args:
    frames: A tensor with shape [batch_size, num_frames, feature_size].
    method: "average", "max", or "none".

  Returns:
    A tensor with shape [batch_size, feature_size] for average or max
    pooling. A tensor with shape [batch_size*num_frames, feature_size]
    for none pooling.

  Raises:
    ValueError: if method is other than "average", "max", or "none".
      (The docstring previously also advertised "attention", but no such
      branch exists; the claim is removed for consistency with the code.)
  """
  if method == "average":
    return tf.reduce_mean(frames, 1)
  elif method == "max":
    return tf.reduce_max(frames, 1)
  elif method == "none":
    # BUG FIX: tf.Tensor has no shape_as_list() method; the static shape is
    # exposed as Tensor.shape (a TensorShape) with an as_list() accessor.
    feature_size = frames.shape.as_list()[2]
    # Flatten batch and frame dimensions together, keeping the feature axis.
    return tf.reshape(frames, [-1, feature_size])
  else:
    raise ValueError("Unrecognized pooling method: %s" % method)


def SampleUniformFrames(model_input, num_frames, num_samples):
  """ Uniformly samples (deterministically) a set of frames of size num_samples.

  Args:
    model_input: A tensor of size batch_size x max_frames x feature_size
    num_frames: A tensor of size batch_size x 1
    num_samples: A scalar

  Returns:
    `model_input`: A tensor of size batch_size x num_samples x feature_size
  """
  batch_size = tf.shape(model_input)[0]
  # Fractions in [0, 1): take num_samples+1 evenly spaced points and drop the
  # final 1.0 so the scaled indices below never reach num_frames.
  even_dist_samp = tf.expand_dims(tf.linspace(0.0, 1.0, num_samples + 1), axis=0)
  even_dist_samp = tf.slice(even_dist_samp, [0, 0], [1, num_samples])
  frame_index = tf.cast(
      tf.multiply(
          tf.tile(even_dist_samp, [batch_size, 1]),
          tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])), tf.int32)
  batch_index = tf.tile(
      tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
  index = tf.stack([batch_index, frame_index], 2)
  return tf.gather_nd(model_input, index)
class BaseModel(object):
  """Inherit from this class when implementing new models."""

  # The stray `pass` that previously preceded this method was dead code
  # (a docstring already forms the class body) and has been removed.
  def create_model(self, unused_model_input, **unused_params):
    """Build the model's prediction graph.

    Subclasses must override this method.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError()
def orthogonal_regularizer(scale, scope=None):
    """ Return a function that computes orthogonal regularization.

    The returned function penalizes the sum of absolute off-diagonal entries
    of W_norm^T . W_norm (after l2-normalizing `weights`), encouraging the
    weight vectors to be mutually orthogonal.

    :param scale: A scalar multiplier `Tensor`. 0.0 disables the regularizer.
    :param scope: An optional scope name.
    :return: A function with signature `orthogonal_sum(weights)` that applies orthogonal regularization.
    """
    if isinstance(scale, numbers.Integral):
        raise ValueError('scale cannot be an integer: %s' % (scale,))
    if isinstance(scale, numbers.Real):
        if scale < 0.:
            raise ValueError('Setting a scale less than 0 on a regularizer: %g.' %
                             scale)
        if scale == 0.:
            logging.info('Scale of 0 disables regularizer.')
            # A disabled regularizer contributes no loss term.
            return lambda _: None

    def orthogonal_sum(weights):
        """ Applies orthogonal regularization to weights. """
        with ops.name_scope(scope, 'orthogonal_regularizer', [weights]) as name:
            tensor_scale = ops.convert_to_tensor(scale,
                                                 dtype=weights.dtype.base_dtype,
                                                 name='scale')

            # NOTE(review): axis=1 normalizes along dimension 1, but the Gram
            # matrix below is W^T.W whose diagonal holds column norms —
            # confirm the intended normalization axis for the weight layout.
            norm_weights = tf.nn.l2_normalize(weights, axis=1)
            anchor_weights_t = tf.transpose(norm_weights)
            det_reg = tf.matmul(anchor_weights_t, norm_weights)
            # Subtract the identity, then sum absolute values: only the
            # off-diagonal (non-orthogonal) mass contributes.
            identity = tf.eye(tf.shape(det_reg)[0])
            det_reg = tf.subtract(det_reg, identity)
            det_reg = tf.reduce_sum(tf.abs(det_reg))

            # Print sum value before scaling
            det_reg = tf.Print(det_reg, [det_reg], "Orthogonal sum for \"{}\" :".format(name))

            return standard_ops.multiply(tensor_scale, det_reg, name=name)

    return orthogonal_sum


def reduce_var(x, axis=None, keep_dim=False):
    """ Return variance of a tensor, alongside the specified axis.

    Reference:
      https://stackoverflow.com/questions/39354566/what-is-the-equivalent-of-np-std-in-tensorflow

    :param x: Tensor or variable
    :param axis: int
    :param keep_dim: bool
    :return: Tensor with the variance of elements of x
    """
    # Variance as E[(x - E[x])^2]; the mean keeps its dims so the
    # subtraction broadcasts along `axis`.
    m = tf.reduce_mean(x, axis=axis, keep_dims=True)
    devs_squared = tf.square(x - m)
    return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keep_dim)
class BaseModule(object):
  """Inherit from this class when implementing new modules."""

  # The stray `pass` that previously preceded this method was dead code
  # (a docstring already forms the class body) and has been removed.
  def forward(self, unused_module_input, **unused_params):
    """Run the module on its input.

    Subclasses must override this method.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError()
def resize_axis(tensor, axis, new_size, fill_value=0):
  """Truncates or pads a tensor to new_size on a given axis.

  Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
  size increases, the padding will be performed at the end, using fill_value.

  Args:
    tensor: The tensor to be resized.
    axis: An integer representing the dimension to be sliced.
    new_size: An integer or 0d tensor representing the new value for
      tensor.shape[axis].
    fill_value: Value to use to fill any new entries in the tensor. Will be
      cast to the type of tensor.

  Returns:
    The resized tensor.
  """
  tensor = tf.convert_to_tensor(tensor)
  shape = tf.unstack(tf.shape(tensor))

  # Amount of padding needed on `axis` (0 when truncating).
  pad_shape = shape[:]
  pad_shape[axis] = tf.maximum(0, new_size - shape[axis])

  # Size to keep from the original tensor on `axis`.
  shape[axis] = tf.minimum(shape[axis], new_size)
  shape = tf.stack(shape)

  # Keep the leading slice, then append fill_value padding at the end.
  resized = tf.concat([
      tf.slice(tensor, tf.zeros_like(shape), shape),
      tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
  ], axis)

  # Update shape.
  new_shape = tensor.get_shape().as_list()  # A copy is being made.
  new_shape[axis] = new_size
  resized.set_shape(new_shape)
  return resized


class BaseReader(object):
  """Inherit from this class when implementing new readers."""

  def prepare_reader(self, unused_filename_queue):
    """Create a thread for generating prediction and label tensors."""
    raise NotImplementedError()
  def prepare_reader(self, filename_queue, batch_size=1024):
    """Creates a single reader thread for pre-aggregated YouTube 8M Examples.

    Args:
      filename_queue: A tensorflow queue of filename locations.
      batch_size: Maximum number of serialized Examples to read at once.

    Returns:
      A tuple of video indexes, features, labels, and padding data.
    """
    reader = tf.TFRecordReader()
    _, serialized_examples = reader.read_up_to(filename_queue, batch_size)

    tf.add_to_collection("serialized_examples", serialized_examples)
    return self.prepare_serialized_examples(serialized_examples)

  def prepare_serialized_examples(self, serialized_examples):
    """Parse a batch of serialized Examples into feature and label tensors.

    Args:
      serialized_examples: A 1-D string tensor of serialized tf.Examples.

    Returns:
      A tuple of (video ids, concatenated float features, dense label
      indicators, all-ones padding tensor — one entry per example).
    """
    # set the mapping from the fields to data types in the proto
    num_features = len(self.feature_names)
    assert num_features > 0, "self.feature_names is empty!"
    assert len(self.feature_names) == len(self.feature_sizes), \
        "length of feature_names (={}) != length of feature_sizes (={})".format( \
        len(self.feature_names), len(self.feature_sizes))

    feature_map = {"id": tf.FixedLenFeature([], tf.string),
                   "labels": tf.VarLenFeature(tf.int64)}
    for feature_index in range(num_features):
      feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature(
          [self.feature_sizes[feature_index]], tf.float32)

    features = tf.parse_example(serialized_examples, features=feature_map)
    # Sparse label ids -> dense boolean indicator over the full vocabulary.
    labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
    labels.set_shape([None, self.num_classes])
    # Concatenate the requested feature columns along the feature axis.
    concatenated_features = tf.concat([
        features[feature_name] for feature_name in self.feature_names], 1)

    return features["id"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]])
  def get_video_matrix(self,
                       features,
                       feature_size,
                       max_frames,
                       max_quantized_value,
                       min_quantized_value):
    """Decodes features from an input string and quantizes it.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
    # Each frame was stored as uint8 bytes; decode and reshape to
    # [num_frames, feature_size].
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    # Map the quantized byte range back to [min, max] float values.
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    # Pad (with zeros) or truncate so every video has exactly max_frames rows.
    feature_matrix = resize_axis(feature_matrix, 0, max_frames)
    return feature_matrix, num_frames
  def prepare_serialized_examples(self, serialized_example,
                                  max_quantized_value=2, min_quantized_value=-2):
    """Parse one serialized SequenceExample into batch-of-one tensors.

    Args:
      serialized_example: A scalar string tensor holding one serialized
        tf.SequenceExample.
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      A tuple of (video ids, video feature matrix, labels, num frames),
      each with a leading batch dimension of 1.
    """

    contexts, features = tf.parse_single_sequence_example(
        serialized_example,
        context_features={"id": tf.FixedLenFeature(
            [], tf.string),
            "labels": tf.VarLenFeature(tf.int64)},
        sequence_features={
            feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
            for feature_name in self.feature_names
        })

    # read ground truth labels
    labels = (tf.cast(
        tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1,
                           validate_indices=False),
        tf.bool))

    # loads (potentially) different types of features and concatenates them
    num_features = len(self.feature_names)
    assert num_features > 0, "No feature selected: feature_names is empty!"

    assert len(self.feature_names) == len(self.feature_sizes), \
        "length of feature_names (={}) != length of feature_sizes (={})".format(len(self.feature_names),
                                                                                len(self.feature_sizes))

    num_frames = -1  # the number of frames in the video
    feature_matrices = [None] * num_features  # an array of different features
    for feature_index in range(num_features):
      feature_matrix, num_frames_in_this_feature = self.get_video_matrix(
          features[self.feature_names[feature_index]],
          self.feature_sizes[feature_index],
          self.max_frames,
          max_quantized_value,
          min_quantized_value)
      if num_frames == -1:
        num_frames = num_frames_in_this_feature
      else:
        # NOTE(review): the op returned by tf.assert_equal is discarded; in
        # graph mode an unused assert op may never execute — confirm whether
        # this check is intended to be enforced.
        tf.assert_equal(num_frames, num_frames_in_this_feature)

      feature_matrices[feature_index] = feature_matrix

    # cap the number of frames at self.max_frames
    num_frames = tf.minimum(num_frames, self.max_frames)

    # concatenate different features
    video_matrix = tf.concat(feature_matrices, 1)

    # convert to batch format.
    batch_video_ids = tf.expand_dims(contexts["id"], 0)
    batch_video_matrix = tf.expand_dims(video_matrix, 0)
    batch_labels = tf.expand_dims(labels, 0)
    batch_frames = tf.expand_dims(num_frames, 0)

    return batch_video_ids, batch_video_matrix, batch_labels, batch_frames
class LstmLastHiddenModule(modules.BaseModule):
    """ LSTM network that outputs the last hidden state. """
    def __init__(self, lstm_size, lstm_layers, num_frames, output_dim, scope_id=None):
        """ Initialize LSTM hidden module.
        :param lstm_size: int
        :param lstm_layers: int
        :param num_frames: num_frames x 1
        :param output_dim: int
        :param scope_id: Object
        """
        self.lstm_size = lstm_size
        self.lstm_layers = lstm_layers
        self.output_dim = output_dim
        self.num_frames = num_frames
        self.scope_id = scope_id

    def forward(self, inputs, **unused_params):
        """ Forward method for LstmLastHiddenModule.
        :param inputs: batch_size x max_frames x num_features
        :return: batch_size x output_dim
        """
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    self.lstm_size, forget_bias=1.0)
                for _ in range(self.lstm_layers)

            ])

        outputs, state = tf.nn.dynamic_rnn(stacked_lstm, inputs,
                                           sequence_length=self.num_frames,
                                           dtype=tf.float32)
        # Only output the hidden state at the end.
        return state[-1].h


class LstmConcatAverageModule(modules.BaseModule):
    """ LSTM layers with stores the average of previous layers. """
    def __init__(self, lstm_size, num_layers, max_frame):
        """ Initialize LSTM average concatenation module.
        :param lstm_size: int
        :param num_layers: int
        :param max_frame: num_frames x 1
        """
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.max_frame = max_frame

    def forward(self, inputs, **unused_params):
        """ Forward method for LstmConcatAverageModule.
        :param inputs: batch_size x max_frames x num_features
        :return: batch_size x output_dim
        """
        # state_is_tuple=False makes the LSTM state a single concatenated
        # tensor, so it can be concatenated with the summaries below.
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    self.lstm_size, forget_bias=1.0, state_is_tuple=False)
                for _ in range(self.num_layers)
            ], state_is_tuple=False)

        outputs, state = tf.nn.dynamic_rnn(stacked_lstm, inputs,
                                           sequence_length=self.max_frame,
                                           dtype=tf.float32)

        # NOTE(review): despite the "average" naming, these are l2-normalized
        # sums (reduce_sum followed by l2_normalize) over the time axis —
        # confirm whether a true mean was intended.
        context_memory = tf.nn.l2_normalize(tf.reduce_sum(outputs, axis=1), dim=1)
        average_state = tf.nn.l2_normalize(tf.reduce_sum(inputs, axis=1), dim=1)
        final_state = tf.concat([context_memory, state, average_state], 1)

        return final_state
""" 16 | import glob 17 | import json 18 | import os 19 | import time 20 | import sys 21 | import eval_util 22 | import losses 23 | import video_level_models 24 | import frame_level_models 25 | import readers 26 | import tensorflow as tf 27 | from tensorflow.python.lib.io import file_io 28 | from tensorflow import app 29 | from tensorflow import flags 30 | from tensorflow import gfile 31 | from tensorflow import logging 32 | import utils 33 | 34 | FLAGS = flags.FLAGS 35 | 36 | 37 | 38 | def find_class_by_name(name, modules): 39 | """ Searches the provided modules for the named class and returns it. """ 40 | modules = [getattr(module, name, None) for module in modules] 41 | return next(a for a in modules if a) 42 | 43 | 44 | def get_input_evaluation_tensors(reader, 45 | data_pattern, 46 | batch_size=1024, 47 | num_readers=1): 48 | """Creates the section of the graph which reads the evaluation data. 49 | 50 | Args: 51 | reader: A class which parses the training data. 52 | data_pattern: A 'glob' style path to the data files. 53 | batch_size: How many examples to process at a time. 54 | num_readers: How many I/O threads to use. 55 | 56 | Returns: 57 | A tuple containing the features tensor, labels tensor, and optionally a 58 | tensor containing the number of frames per video. The exact dimensions 59 | depend on the reader being used. 60 | 61 | Raises: 62 | IOError: If no files matching the given pattern were found. 
def build_graph(reader,
                model,
                eval_data_pattern,
                label_loss_fn,
                batch_size=1024,
                num_readers=1):
  """Creates the Tensorflow graph for evaluation.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
      from BaseModel.
    eval_data_pattern: glob path to the evaluation data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
      from BaseLoss.
    batch_size: How many examples to process at a time.
    num_readers: How many threads to use for I/O operations.
  """

  global_step = tf.Variable(0, trainable=False, name="global_step")
  video_id_batch, model_input_raw, labels_batch, num_frames = get_input_evaluation_tensors(
      # pylint: disable=g-line-too-long
      reader,
      eval_data_pattern,
      batch_size=batch_size,
      num_readers=num_readers)
  tf.summary.histogram("model_input_raw", model_input_raw)

  # Index of the last dimension, which holds the features.
  feature_dim = len(model_input_raw.get_shape()) - 1

  # Normalize input features.
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  with tf.variable_scope("tower"):
    result = model.create_model(model_input,
                                num_frames=num_frames,
                                vocab_size=reader.num_classes,
                                labels=labels_batch,
                                is_training=False)
    predictions = result["predictions"]
    tf.summary.histogram("model_activations", predictions)
    # Prefer a loss supplied by the model itself; otherwise fall back to the
    # provided loss function.
    if "loss" in result.keys():
      label_loss = result["loss"]
    else:
      label_loss = label_loss_fn.calculate_loss(predictions, labels_batch)

  # Expose the evaluation tensors via named collections so the eval loop can
  # fetch them without direct references to this graph-building code.
  tf.add_to_collection("global_step", global_step)
  tf.add_to_collection("loss", label_loss)
  tf.add_to_collection("predictions", predictions)
  tf.add_to_collection("input_batch", model_input)
  tf.add_to_collection("input_batch_raw", model_input_raw)
  tf.add_to_collection("video_id_batch", video_id_batch)
  tf.add_to_collection("num_frames", num_frames)
  tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
  tf.add_to_collection("summary_op", tf.summary.merge_all())


def get_latest_checkpoint():
  """Return the checkpoint path prefix with the largest step in FLAGS.train_dir.

  Returns:
    The newest checkpoint prefix (path without its ".index" suffix), or
    None when no "model.ckpt-*.index" files exist.
  """
  index_files = file_io.get_matching_files(os.path.join(FLAGS.train_dir, 'model.ckpt-*.index'))
  tf.logging.debug("Looking at {}".format(index_files))

  # No files
  if not index_files:
    return None

  # Index file path with the maximum step size.
  latest_index_file = sorted(
      [(int(os.path.basename(f).split("-")[-1].split(".")[0]), f)
       for f in index_files])[-1][1]

  # Chop off .index suffix and return (".index" is 6 characters long).
  return latest_index_file[:-6]
Did you run train.py on the same " 164 | "--train_dir?") % model_flags_path) 165 | flags_dict = json.loads(file_io.FileIO(model_flags_path, mode="r").read()) 166 | 167 | with tf.Graph().as_default(): 168 | # convert feature_names and feature_sizes to lists of values 169 | feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes( 170 | flags_dict["feature_names"], flags_dict["feature_sizes"]) 171 | 172 | if flags_dict["frame_features"]: 173 | reader = readers.YT8MFrameFeatureReader(feature_names=feature_names, 174 | feature_sizes=feature_sizes) 175 | else: 176 | reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names, 177 | feature_sizes=feature_sizes) 178 | 179 | model = find_class_by_name(flags_dict["model"], 180 | [frame_level_models, video_level_models])() 181 | label_loss_fn = find_class_by_name(flags_dict["label_loss"], [losses])() 182 | 183 | if FLAGS.eval_data_pattern is "": 184 | raise IOError("'eval_data_pattern' was not specified. " + 185 | "Nothing to evaluate.") 186 | 187 | build_graph( 188 | reader=reader, 189 | model=model, 190 | eval_data_pattern=FLAGS.eval_data_pattern, 191 | label_loss_fn=label_loss_fn, 192 | num_readers=FLAGS.num_readers, 193 | batch_size=FLAGS.batch_size) 194 | logging.info("built evaluation graph") 195 | 196 | summary_writer = tf.summary.FileWriter( 197 | FLAGS.train_dir, graph=tf.get_default_graph()) 198 | 199 | evl_metrics = eval_util.EvaluationMetrics(reader.num_classes, FLAGS.top_k) 200 | -------------------------------------------------------------------------------- /scripts/generate_gcloud_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Deep Topology All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate command line arguments for Google Cloud MLE evaluation."""

import os

####################################################################
# Configuration ####################################################
####################################################################
# yaml settings. cloudml-4gpu.yaml, cloudml-gpu.yaml, cloudml-gpu-distributed.yaml
CLOUD_GPU = "cloudml-gpu.yaml"
# Name and version of the model
MODEL_NAME = "WillowModel"
MODEL_VERSION = ""
# Does it require frame-level models?
FRAME_LEVEL = True
# What features? e.g. RGB, audio
FEATURES = "rgb,audio"


def main(frame_level=FRAME_LEVEL):
    """Build the gcloud ml-engine evaluation command.

    Args:
        frame_level: If True, evaluate on frame-level validation records;
            otherwise on video-level records. Defaults to the module-level
            FRAME_LEVEL constant, preserving the original behavior.

    Returns:
        The full shell command string (job-name assignment + gcloud call).
    """
    # Start by defining a job name.
    command = "JOB_NAME=yt8m_eval_$(date +%Y%m%d_%H%M%S); "
    command += "gcloud --verbosity=debug ml-engine jobs submit training $JOB_NAME "
    command += "--package-path=youtube-8m --module-name=youtube-8m.eval "
    command += "--staging-bucket=$BUCKET_NAME --region=us-east1 "
    command += "--config=youtube-8m/cloudml_config/{} ".format(CLOUD_GPU)
    if frame_level:
        command += "-- --eval_data_pattern='gs://youtube8m-ml-us-east1/2/frame/validate/validate*.tfrecord' "
        command += "--frame_features=True "
    else:
        # BUGFIX: the closing single quote around the data pattern was missing,
        # which produced an unterminated shell quote.
        command += "-- --eval_data_pattern='gs://youtube8m-ml-us-east1/2/video/validate/validate*.tfrecord' "
        command += "--frame_features=False "
    command += "--train_dir=$BUCKET_NAME/{} ".format(MODEL_NAME + str(MODEL_VERSION))
    command += "--run_once=True"
    return command


if __name__ == "__main__":
    # BUGFIX: the original split on "\\" (Windows-only) and printed the
    # resulting *list*. Climb two directory levels portably instead.
    current_directory = os.path.dirname(os.path.dirname(os.getcwd()))
    print("Run the following command here: {}".format(current_directory))
    print(main())

# ======================================================================
# scripts/generate_gcloud_inference.py (lines 1-13; file continues below)
# ======================================================================
# Copyright 2018 Deep Topology All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate command line arguments for Google Cloud MLE inference."""

import os

####################################################################
# Configuration ####################################################
####################################################################
# yaml settings. cloudml-4gpu.yaml, cloudml-gpu.yaml, cloudml-gpu-distributed.yaml
CLOUD_GPU = "cloudml-gpu-inference.yaml"
# Name and version of the model
MODEL_NAME = "WillowModel"
MODEL_VERSION = ""
# Does it require frame-level models?
FRAME_LEVEL = True
# What features? e.g. RGB, audio
FEATURES = "rgb,audio"
# Some additional flags to execute.
# NOTE(review): EXTRA is defined but never appended to the command below --
# confirm whether "--batch_size 128" was meant to be passed to inference.
EXTRA = "--batch_size 128"


def main(frame_level=FRAME_LEVEL):
    """Build the gcloud ml-engine batch-inference command.

    Args:
        frame_level: If True, run inference on frame-level test records;
            otherwise on video-level records. Defaults to the module-level
            FRAME_LEVEL constant, preserving the original behavior.

    Returns:
        The full shell command string (job-name assignment + gcloud call).
    """
    # Start by defining a job name.
    command = "JOB_NAME=yt8m_inference_$(date +%Y%m%d_%H%M%S); "
    command += "gcloud --verbosity=debug ml-engine jobs submit training $JOB_NAME "
    command += "--package-path=youtube-8m --module-name=youtube-8m.inference "
    command += "--staging-bucket=$BUCKET_NAME --region=us-east1 "
    command += "--config=youtube-8m/cloudml_config/{} ".format(CLOUD_GPU)
    if frame_level:
        command += "-- --input_data_pattern='gs://youtube8m-ml-us-east1/2/frame/test/test*.tfrecord' "
        command += "--frame_features=True "
    else:
        # BUGFIX: the closing single quote around the data pattern was missing,
        # which produced an unterminated shell quote.
        command += "-- --input_data_pattern='gs://youtube8m-ml-us-east1/2/video/test/test*.tfrecord' "
        command += "--frame_features=False "
    command += "--train_dir=$BUCKET_NAME/{} ".format(MODEL_NAME + str(MODEL_VERSION))
    command += "--output_file=$BUCKET_NAME/{}/predictions.csv".format(MODEL_NAME + str(MODEL_VERSION))
    return command


if __name__ == "__main__":
    # BUGFIX: the original split on "\\" (Windows-only) and printed the
    # resulting *list*. Climb two directory levels portably instead.
    current_directory = os.path.dirname(os.path.dirname(os.getcwd()))
    print("Run the following command here: {}".format(current_directory))
    print(main())
# Copyright 2018 Deep Topology All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate command line arguments for Google Cloud MLE training."""

import os

####################################################################
# Configuration ####################################################
####################################################################
# yaml settings. cloudml-4gpu.yaml, cloudml-gpu.yaml, cloudml-gpu-distributed.yaml
CLOUD_GPU = "cloudml-gpu.yaml"
# Name and version of the model
MODEL_NAME = "WillowModel"
MODEL_VERSION = 1
# Does it require frame-level models?
FRAME_LEVEL = True
# What features? e.g. RGB, audio
FEATURES = "rgb,audio"
# Batch size.
BATCH_SIZE = 128
# Base LR.
BASE_LEARNING_RATE = 0.0002
# Initialize a new model?
START_NEW_MODEL = True


def main(frame_level=FRAME_LEVEL, start_new_model=START_NEW_MODEL):
    """Build the gcloud ml-engine training command.

    Args:
        frame_level: If True, train on frame-level records; otherwise on
            video-level records. Defaults to the FRAME_LEVEL constant.
        start_new_model: If True, append --start_new_model. Defaults to the
            START_NEW_MODEL constant, preserving the original behavior.

    Returns:
        The full shell command string (job-name assignment + gcloud call).
    """
    # Start by defining a job name.
    command = "JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S); "
    command += "gcloud --verbosity=debug ml-engine jobs submit training $JOB_NAME "
    command += "--package-path=youtube-8m --module-name=youtube-8m.train "
    command += "--staging-bucket=$BUCKET_NAME --region=us-east1 "
    command += "--config=youtube-8m/cloudml_config/{} ".format(CLOUD_GPU)
    if frame_level:
        command += "-- --train_data_pattern='gs://youtube8m-ml-us-east1/2/frame/train/train*.tfrecord' "
        command += "--frame_features=True "
    else:
        command += "-- --train_data_pattern='gs://youtube8m-ml-us-east1/2/video/train/train*.tfrecord' "
        command += "--frame_features=False "
    command += "--model={} ".format(MODEL_NAME)
    command += "--feature_names='{}' ".format(FEATURES)
    command += "--feature_sizes='1024,128' "
    command += "--batch_size={} ".format(str(BATCH_SIZE))
    command += "--train_dir=$BUCKET_NAME/{} ".format(MODEL_NAME + str(MODEL_VERSION))
    command += "--base_learning_rate={} ".format(str(BASE_LEARNING_RATE))
    if start_new_model:
        command += "--start_new_model"
    return command


if __name__ == "__main__":
    # BUGFIX: the original split on "\\" (Windows-only) and printed the
    # resulting *list*. Climb two directory levels portably instead.
    current_directory = os.path.dirname(os.path.dirname(os.getcwd()))
    print("Run the following command here: {}".format(current_directory))
    print(main())

# ======================================================================
# scripts/generate_gcloud_train_valid.py (lines 1-4; file continues below)
# ======================================================================
# Copyright 2018 Deep Topology All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Generate command line arguments for Google Cloud MLE training.
Run command: BUCKET_NAME=gs://dtp1_yt8m_train_bucket
"""


import os

####################################################################
# Configuration ####################################################
####################################################################
# yaml settings. cloudml-4gpu.yaml, cloudml-gpu.yaml, cloudml-gpu-distributed.yaml
CLOUD_GPU = "cloudml-gpu.yaml"
# Name and version of the model
MODEL_NAME = "TransformerEncoder"
MODEL_VERSION = ""
# Does it require frame-level models?
FRAME_LEVEL = True
# What features? e.g. RGB, audio
FEATURES = "rgb,audio"
# Batch size.
BATCH_SIZE = 64
# Base LR.
BASE_LEARNING_RATE = 0.00005
# Initialize a new model?
START_NEW_MODEL = True
EXTRA = "-learning_rate_decay=0.7"
# EXTRA = "-tembed_v3_video_anchor_size=64 " \
#         "-tembed_v3_audio_anchor_size=8 " \
#         "-tembed_v3_distrib_concat_hidden_size=4096 " \
#         "-tembed_v3_temporal_concat_hidden_size=4096 " \
#         "-tembed_v3_full_concat_hidden_size=8192"


def main():
    """Build matching cloud and local gcloud training commands.

    Returns:
        A (command, local_command) tuple: the ml-engine job submission
        command and the `gcloud ml-engine local train` equivalent.
    """
    # Cloud job: define a job name, then submit; local run uses local train.
    local_command = "gcloud ml-engine local train "
    command = "JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S); "
    command += "gcloud --verbosity=debug ml-engine jobs submit training $JOB_NAME "
    command += "--package-path=youtube-8m --module-name=youtube-8m.train "
    local_command += "--package-path=youtube-8m --module-name=youtube-8m.train "
    command += "--staging-bucket=$BUCKET_NAME --region=us-east1 "
    command += "--config=youtube-8m/cloudml_config/{} ".format(CLOUD_GPU)

    # Module flags shared verbatim by the cloud and local invocations.
    if FRAME_LEVEL:
        shared = [
            "-- --train_data_pattern='gs://youtube8m-ml-us-east1/2/frame/train/*.tfrecord' ",
            "--frame_features=True ",
        ]
    else:
        shared = [
            "-- --train_data_pattern='gs://youtube8m-ml-us-east1/2/video/train/*.tfrecord' ",
            "--frame_features=False ",
        ]
    shared.append("--base_learning_rate={} ".format(str(BASE_LEARNING_RATE)))
    shared.append("--model={} ".format(MODEL_NAME))
    shared.append("--feature_names='{}' ".format(FEATURES))
    shared.append("--feature_sizes='1024,128' ")
    shared.append("--batch_size={} ".format(str(BATCH_SIZE)))
    shared_flags = "".join(shared)
    command += shared_flags
    local_command += shared_flags

    # The two runs write to different train_dirs.
    command += "--train_dir=$BUCKET_NAME/{} ".format(MODEL_NAME + str(MODEL_VERSION))
    local_command += "--train_dir=/tmp/yt8m_train "
    # BUGFIX: --base_learning_rate used to be appended a *second* time to both
    # commands here; the duplicate has been removed.
    if START_NEW_MODEL:
        command += "--start_new_model "
        local_command += "--start_new_model "
    # BUGFIX: a separating space was missing, fusing this flag with EXTRA
    # into "--runtime-version=1.8-learning_rate_decay=0.7".
    local_command += "--runtime-version=1.8 "
    command += EXTRA
    local_command += EXTRA
    return command, local_command


if __name__ == "__main__":
    current_directory = os.getcwd()
    current_directory = "/".join(current_directory.split("/")[:-2])
    print("Run the following command here: {}".format(current_directory))
    c, lc = main()
    print("Local: \n{}".format(lc))
    print("Cloud: \n{}".format(c))

# ======================================================================
# train.py (lines 1-18; file continues below)
# ======================================================================
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Binary for training Tensorflow models on the YouTube-8M dataset."""
import os
import sys

# Explicitly add the file's directory to the path list.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
sys.path.append(os.path.join(os.getcwd(), "modules"))

# NOTE: these imports intentionally follow the sys.path manipulation above so
# that sibling modules (eval_util, losses, readers, ...) resolve.
import json
import time

import eval_util
import export_model
import losses
import frame_level_models
import video_level_models
import readers
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow import app
from tensorflow import flags
from tensorflow import gfile
from tensorflow import logging
from tensorflow.python.client import device_lib
import utils
# CLEANUP: a duplicate `import os` was removed -- os is already imported at
# the top of this file.

FLAGS = flags.FLAGS

if __name__ == "__main__":
  # Dataset flags.
  flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
                      "The directory to save the model files in.")
  flags.DEFINE_string(
      "train_data_pattern", "gs://youtube8m-ml-us-east1/2/frame/train/train*.tfrecord,"
      "gs://youtube8m-ml-us-east1/2/frame/validate/validate*.tfrecord",
      "File glob for the training dataset. If the files refer to Frame Level "
      "features (i.e. tensorflow.SequenceExample), then set --reader_type "
      "format. The (Sequence)Examples are expected to have 'rgb' byte array "
      "sequence feature as well as a 'labels' int64 context feature.")
  flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
                      "to use for training.")
  flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")

  # Model flags.
  flags.DEFINE_bool(
      "frame_features", False,
      "If set, then --train_data_pattern must be frame-level features. "
      "Otherwise, --train_data_pattern must be aggregated video-level "
      "features. The model must also be set appropriately (i.e. to read 3D "
      "batches VS 4D batches.")
  flags.DEFINE_string(
      "model", "LogisticModel",
      "Which architecture to use for the model. Models are defined "
      "in models.py.")
  flags.DEFINE_bool(
      "start_new_model", False,
      "If set, this will not resume from a checkpoint and will instead create a"
      " new model instance.")

  # Training flags.
  flags.DEFINE_integer("num_gpu", 1,
                       "The maximum number of GPU devices to use for training. "
                       "Flag only applies if GPUs are installed")
  flags.DEFINE_integer("batch_size", 1024,
                       "How many examples to process per batch for training.")
  flags.DEFINE_string("label_loss", "CrossEntropyLoss",
                      "Which loss function to use for training the model.")
  flags.DEFINE_float(
      "regularization_penalty", 1.0,
      "How much weight to give to the regularization loss (the label loss has "
      "a weight of 1).")
  flags.DEFINE_float("base_learning_rate", 0.01,
                     "Which learning rate to start with.")
  flags.DEFINE_float("learning_rate_decay", 0.95,
                     "Learning rate decay factor to be applied every "
                     "learning_rate_decay_examples.")
  flags.DEFINE_float("learning_rate_decay_examples", 4000000,
                     "Multiply current learning rate by learning_rate_decay "
                     "every learning_rate_decay_examples.")
  flags.DEFINE_integer("num_epochs", 5,
                       "How many passes to make over the dataset before "
                       "halting training.")
  flags.DEFINE_integer("max_steps", None,
                       "The maximum number of iterations of the training loop.")
  flags.DEFINE_integer("export_model_steps", 1000,
                       "The period, in number of steps, with which the model "
                       "is exported for batch prediction.")

  # Other flags.
104 | flags.DEFINE_integer("num_readers", 8, 105 | "How many threads to use for reading input files.") 106 | flags.DEFINE_string("optimizer", "AdamOptimizer", 107 | "What optimizer class to use.") 108 | flags.DEFINE_float("clip_gradient_norm", 1.0, "Norm to clip gradients to.") 109 | flags.DEFINE_bool( 110 | "log_device_placement", False, 111 | "Whether to write the device on which every op will run into the " 112 | "logs on startup.") 113 | 114 | 115 | def validate_class_name(flag_value, category, modules, expected_superclass): 116 | """Checks that the given string matches a class of the expected type. 117 | Args: 118 | flag_value: A string naming the class to instantiate. 119 | category: A string used further describe the class in error messages 120 | (e.g. 'model', 'reader', 'loss'). 121 | modules: A list of modules to search for the given class. 122 | expected_superclass: A class that the given class should inherit from. 123 | Raises: 124 | FlagsError: If the given class could not be found or if the first class 125 | found with that name doesn't inherit from the expected superclass. 126 | Returns: 127 | True if a class was found that matches the given constraints. 128 | """ 129 | candidates = [getattr(module, flag_value, None) for module in modules] 130 | for candidate in candidates: 131 | if not candidate: 132 | continue 133 | if not issubclass(candidate, expected_superclass): 134 | raise flags.FlagsError("%s '%s' doesn't inherit from %s." % 135 | (category, flag_value, 136 | expected_superclass.__name__)) 137 | return True 138 | raise flags.FlagsError("Unable to find %s '%s'." % (category, flag_value)) 139 | 140 | 141 | def get_input_data_tensors(reader, 142 | data_pattern, 143 | batch_size=1000, 144 | num_epochs=None, 145 | num_readers=1): 146 | """Creates the section of the graph which reads the training data. 147 | Args: 148 | reader: A class which parses the training data. 149 | data_pattern: A 'glob' style path to the data files. 
150 | batch_size: How many examples to process at a time. 151 | num_epochs: How many passes to make over the training data. Set to 'None' 152 | to run indefinitely. 153 | num_readers: How many I/O threads to use. 154 | Returns: 155 | A tuple containing the features tensor, labels tensor, and optionally a 156 | tensor containing the number of frames per video. The exact dimensions 157 | depend on the reader being used. 158 | Raises: 159 | IOError: If no files matching the given pattern were found. 160 | """ 161 | logging.info("Using batch size of " + str(batch_size) + " for training.") 162 | with tf.name_scope("train_input"): 163 | file_dirs = data_pattern.split(",") 164 | files = list() 165 | for f in file_dirs: 166 | cur_file = gfile.Glob(f) 167 | files.extend(cur_file) 168 | if not files: 169 | raise IOError("Unable to find training files. data_pattern='" + 170 | data_pattern + "'.") 171 | logging.info("Number of training files: %s.", str(len(files))) 172 | filename_queue = tf.train.string_input_producer( 173 | files, num_epochs=num_epochs, shuffle=True) 174 | training_data = [ 175 | reader.prepare_reader(filename_queue) for _ in range(num_readers) 176 | ] 177 | 178 | return tf.train.shuffle_batch_join( 179 | training_data, 180 | batch_size=batch_size, 181 | capacity=batch_size * 5, 182 | min_after_dequeue=batch_size, 183 | allow_smaller_final_batch=True, 184 | enqueue_many=True) 185 | 186 | 187 | def find_class_by_name(name, modules): 188 | """Searches the provided modules for the named class and returns it.""" 189 | modules = [getattr(module, name, None) for module in modules] 190 | return next(a for a in modules if a) 191 | 192 | 193 | def build_graph(reader, 194 | model, 195 | train_data_pattern, 196 | label_loss_fn=losses.CrossEntropyLoss(), 197 | batch_size=1000, 198 | base_learning_rate=0.01, 199 | learning_rate_decay_examples=1000000, 200 | learning_rate_decay=0.95, 201 | optimizer_class=tf.train.AdamOptimizer, 202 | clip_gradient_norm=1.0, 203 | 
regularization_penalty=1, 204 | num_readers=1, 205 | num_epochs=None): 206 | """Creates the Tensorflow graph. 207 | This will only be called once in the life of 208 | a training model, because after the graph is created the model will be 209 | restored from a meta graph file rather than being recreated. 210 | Args: 211 | reader: The data file reader. It should inherit from BaseReader. 212 | model: The core model (e.g. logistic or neural net). It should inherit 213 | from BaseModel. 214 | train_data_pattern: glob path to the training data files. 215 | label_loss_fn: What kind of loss to apply to the model. It should inherit 216 | from BaseLoss. 217 | batch_size: How many examples to process at a time. 218 | base_learning_rate: What learning rate to initialize the optimizer with. 219 | optimizer_class: Which optimization algorithm to use. 220 | clip_gradient_norm: Magnitude of the gradient to clip to. 221 | regularization_penalty: How much weight to give the regularization loss 222 | compared to the label loss. 223 | num_readers: How many threads to use for I/O operations. 224 | num_epochs: How many passes to make over the data. 'None' means an 225 | unlimited number of passes. 226 | """ 227 | 228 | global_step = tf.Variable(0, trainable=False, name="global_step") 229 | 230 | local_device_protos = device_lib.list_local_devices() 231 | gpus = [x.name for x in local_device_protos if x.device_type == 'GPU'] 232 | gpus = gpus[:FLAGS.num_gpu] 233 | num_gpus = len(gpus) 234 | 235 | if num_gpus > 0: 236 | logging.info("Using the following GPUs to train: " + str(gpus)) 237 | num_towers = num_gpus 238 | device_string = '/gpu:%d' 239 | else: 240 | logging.info("No GPUs found. 
Training on CPU.") 241 | num_towers = 1 242 | device_string = '/cpu:%d' 243 | 244 | learning_rate = tf.train.exponential_decay( 245 | base_learning_rate, 246 | global_step * batch_size * num_towers, 247 | learning_rate_decay_examples, 248 | learning_rate_decay, 249 | staircase=True) 250 | tf.summary.scalar('learning_rate', learning_rate) 251 | 252 | optimizer = optimizer_class(learning_rate) 253 | unused_video_id, model_input_raw, labels_batch, num_frames = ( 254 | get_input_data_tensors( 255 | reader, 256 | train_data_pattern, 257 | batch_size=batch_size * num_towers, 258 | num_readers=num_readers, 259 | num_epochs=num_epochs)) 260 | tf.summary.histogram("model/input_raw", model_input_raw) 261 | 262 | feature_dim = len(model_input_raw.get_shape()) - 1 263 | 264 | model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) 265 | 266 | tower_inputs = tf.split(model_input, num_towers) 267 | tower_labels = tf.split(labels_batch, num_towers) 268 | tower_num_frames = tf.split(num_frames, num_towers) 269 | tower_gradients = [] 270 | tower_predictions = [] 271 | tower_label_losses = [] 272 | tower_reg_losses = [] 273 | for i in range(num_towers): 274 | # For some reason these 'with' statements can't be combined onto the same 275 | # line. They have to be nested. 
276 | with tf.device(device_string % i): 277 | with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)): 278 | with ( 279 | slim.arg_scope([slim.model_variable, slim.variable], device="/cpu:0" if num_gpus != 1 else "/gpu:0")): 280 | result = model.create_model( 281 | tower_inputs[i], 282 | num_frames=tower_num_frames[i], 283 | vocab_size=reader.num_classes, 284 | labels=tower_labels[i]) 285 | for variable in slim.get_model_variables(): 286 | tf.summary.histogram(variable.op.name, variable) 287 | 288 | predictions = result["predictions"] 289 | tower_predictions.append(predictions) 290 | 291 | if "loss" in result.keys(): 292 | label_loss = result["loss"] 293 | else: 294 | label_loss = label_loss_fn.calculate_loss(predictions, tower_labels[i]) 295 | 296 | if "regularization_loss" in result.keys(): 297 | reg_loss = result["regularization_loss"] 298 | else: 299 | reg_loss = tf.constant(0.0) 300 | 301 | reg_losses = tf.losses.get_regularization_losses() 302 | if reg_losses: 303 | reg_loss += tf.add_n(reg_losses) 304 | 305 | tower_reg_losses.append(reg_loss) 306 | 307 | # Adds update_ops (e.g., moving average updates in batch normalization) as 308 | # a dependency to the train_op. 309 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 310 | if "update_ops" in result.keys(): 311 | update_ops += result["update_ops"] 312 | if update_ops: 313 | with tf.control_dependencies(update_ops): 314 | barrier = tf.no_op(name="gradient_barrier") 315 | with tf.control_dependencies([barrier]): 316 | label_loss = tf.identity(label_loss) 317 | 318 | tower_label_losses.append(label_loss) 319 | 320 | # Incorporate the L2 weight penalties etc. 
321 | final_loss = regularization_penalty * reg_loss + label_loss 322 | gradients = optimizer.compute_gradients(final_loss, 323 | colocate_gradients_with_ops=False) 324 | tower_gradients.append(gradients) 325 | label_loss = tf.reduce_mean(tf.stack(tower_label_losses)) 326 | tf.summary.scalar("label_loss", label_loss) 327 | if regularization_penalty != 0: 328 | reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses)) 329 | tf.summary.scalar("reg_loss", reg_loss) 330 | merged_gradients = utils.combine_gradients(tower_gradients) 331 | 332 | if clip_gradient_norm > 0: 333 | with tf.name_scope('clip_grads'): 334 | merged_gradients = utils.clip_gradient_norms(merged_gradients, clip_gradient_norm) 335 | 336 | train_op = optimizer.apply_gradients(merged_gradients, global_step=global_step) 337 | 338 | tf.add_to_collection("global_step", global_step) 339 | tf.add_to_collection("loss", label_loss) 340 | tf.add_to_collection("predictions", tf.concat(tower_predictions, 0)) 341 | tf.add_to_collection("input_batch_raw", model_input_raw) 342 | tf.add_to_collection("input_batch", model_input) 343 | tf.add_to_collection("num_frames", num_frames) 344 | tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) 345 | tf.add_to_collection("train_op", train_op) 346 | 347 | 348 | class Trainer(object): 349 | """A Trainer to train a Tensorflow graph.""" 350 | 351 | def __init__(self, cluster, task, train_dir, model, reader, model_exporter, 352 | log_device_placement=True, max_steps=None, 353 | export_model_steps=1000): 354 | """"Creates a Trainer. 355 | Args: 356 | cluster: A tf.train.ClusterSpec if the execution is distributed. 357 | None otherwise. 358 | task: A TaskSpec describing the job type and the task index. 
359 | """ 360 | 361 | self.cluster = cluster 362 | self.task = task 363 | self.is_master = (task.type == "master" and task.index == 0) 364 | self.train_dir = train_dir 365 | self.config = tf.ConfigProto( 366 | allow_soft_placement=True, log_device_placement=log_device_placement) 367 | self.model = model 368 | self.reader = reader 369 | self.model_exporter = model_exporter 370 | self.max_steps = max_steps 371 | self.max_steps_reached = False 372 | self.export_model_steps = export_model_steps 373 | self.last_model_export_step = 0 374 | 375 | # if self.is_master and self.task.index > 0: 376 | # raise StandardError("%s: Only one replica of master expected", 377 | # task_as_string(self.task)) 378 | 379 | def run(self, start_new_model=False): 380 | """Performs training on the currently defined Tensorflow graph. 381 | Returns: 382 | A tuple of the training Hit@1 and the training PERR. 383 | """ 384 | if self.is_master and start_new_model: 385 | self.remove_training_directory(self.train_dir) 386 | 387 | if not os.path.exists(self.train_dir): 388 | os.makedirs(self.train_dir) 389 | 390 | model_flags_dict = { 391 | "model": FLAGS.model, 392 | "feature_sizes": FLAGS.feature_sizes, 393 | "feature_names": FLAGS.feature_names, 394 | "frame_features": FLAGS.frame_features, 395 | "label_loss": FLAGS.label_loss, 396 | } 397 | flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") 398 | if os.path.exists(flags_json_path): 399 | existing_flags = json.load(open(flags_json_path)) 400 | if existing_flags != model_flags_dict: 401 | logging.error("Model flags do not match existing file %s. Please " 402 | "delete the file, change --train_dir, or pass flag " 403 | "--start_new_model", 404 | flags_json_path) 405 | logging.error("Ran model with flags: %s", str(model_flags_dict)) 406 | logging.error("Previously ran with flags: %s", str(existing_flags)) 407 | exit(1) 408 | else: 409 | # Write the file. 
410 | with open(flags_json_path, "w") as fout: 411 | fout.write(json.dumps(model_flags_dict)) 412 | 413 | target, device_fn = self.start_server_if_distributed() 414 | 415 | meta_filename = self.get_meta_filename(start_new_model, self.train_dir) 416 | 417 | with tf.Graph().as_default() as graph: 418 | if meta_filename: 419 | saver = self.recover_model(meta_filename) 420 | 421 | with tf.device(device_fn): 422 | if not meta_filename: 423 | saver = self.build_model(self.model, self.reader) 424 | 425 | global_step = tf.get_collection("global_step")[0] 426 | loss = tf.get_collection("loss")[0] 427 | predictions = tf.get_collection("predictions")[0] 428 | labels = tf.get_collection("labels")[0] 429 | train_op = tf.get_collection("train_op")[0] 430 | init_op = tf.global_variables_initializer() 431 | 432 | sv = tf.train.Supervisor( 433 | graph, 434 | logdir=self.train_dir, 435 | init_op=init_op, 436 | is_chief=self.is_master, 437 | global_step=global_step, 438 | save_model_secs=15 * 60, 439 | save_summaries_secs=120, 440 | saver=saver) 441 | 442 | logging.info("%s: Starting managed session.", task_as_string(self.task)) 443 | with sv.managed_session(target, config=self.config) as sess: 444 | try: 445 | logging.info("%s: Entering training loop.", task_as_string(self.task)) 446 | while (not sv.should_stop()) and (not self.max_steps_reached): 447 | batch_start_time = time.time() 448 | _, global_step_val, loss_val, predictions_val, labels_val = sess.run( 449 | [train_op, global_step, loss, predictions, labels]) 450 | seconds_per_batch = time.time() - batch_start_time 451 | examples_per_second = labels_val.shape[0] / seconds_per_batch 452 | 453 | if self.max_steps and self.max_steps <= global_step_val: 454 | self.max_steps_reached = True 455 | 456 | if self.is_master and global_step_val % 10 == 0 and self.train_dir: 457 | eval_start_time = time.time() 458 | hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) 459 | perr = 
eval_util.calculate_precision_at_equal_recall_rate(predictions_val, 460 | labels_val) 461 | gap = eval_util.calculate_gap(predictions_val, labels_val) 462 | eval_end_time = time.time() 463 | eval_time = eval_end_time - eval_start_time 464 | 465 | logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + 466 | " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + 467 | ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + 468 | " GAP: " + ("%.2f" % gap)) 469 | 470 | sv.summary_writer.add_summary( 471 | utils.MakeSummary("model/Training_Hit@1", hit_at_one), 472 | global_step_val) 473 | sv.summary_writer.add_summary( 474 | utils.MakeSummary("model/Training_Perr", perr), global_step_val) 475 | sv.summary_writer.add_summary( 476 | utils.MakeSummary("model/Training_GAP", gap), global_step_val) 477 | sv.summary_writer.add_summary( 478 | utils.MakeSummary("global_step/Examples/Second", 479 | examples_per_second), global_step_val) 480 | sv.summary_writer.flush() 481 | 482 | # Exporting the model every x steps 483 | time_to_export = ((self.last_model_export_step == 0) or 484 | (global_step_val - self.last_model_export_step 485 | >= self.export_model_steps)) 486 | 487 | if self.is_master and time_to_export: 488 | self.export_model(global_step_val, sv.saver, sv.save_path, sess) 489 | self.last_model_export_step = global_step_val 490 | else: 491 | logging.info("training step " + str(global_step_val) + " | Loss: " + 492 | ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) 493 | except tf.errors.OutOfRangeError: 494 | logging.info("%s: Done training -- epoch limit reached.", 495 | task_as_string(self.task)) 496 | 497 | logging.info("%s: Exited training loop.", task_as_string(self.task)) 498 | sv.Stop() 499 | 500 | 501 | def export_model(self, global_step_val, saver, save_path, session): 502 | # If the model has already been exported at this step, return. 
503 | if global_step_val == self.last_model_export_step: 504 | return 505 | 506 | last_checkpoint = saver.save(session, save_path, global_step_val) 507 | 508 | model_dir = "{0}/export/step_{1}".format(self.train_dir, global_step_val) 509 | logging.info("%s: Exporting the model at step %s to %s.", 510 | task_as_string(self.task), global_step_val, model_dir) 511 | 512 | self.model_exporter.export_model( 513 | model_dir=model_dir, 514 | global_step_val=global_step_val, 515 | last_checkpoint=last_checkpoint) 516 | 517 | 518 | def start_server_if_distributed(self): 519 | """Starts a server if the execution is distributed.""" 520 | 521 | if self.cluster: 522 | logging.info("%s: Starting trainer within cluster %s.", 523 | task_as_string(self.task), self.cluster.as_dict()) 524 | server = start_server(self.cluster, self.task) 525 | target = server.target 526 | device_fn = tf.train.replica_device_setter( 527 | ps_device="/job:ps", 528 | worker_device="/job:%s/task:%d" % (self.task.type, self.task.index), 529 | cluster=self.cluster) 530 | else: 531 | target = "" 532 | device_fn = "" 533 | return (target, device_fn) 534 | 535 | def remove_training_directory(self, train_dir): 536 | """Removes the training directory.""" 537 | try: 538 | logging.info( 539 | "%s: Removing existing train directory.", 540 | task_as_string(self.task)) 541 | gfile.DeleteRecursively(train_dir) 542 | except: 543 | logging.error( 544 | "%s: Failed to delete directory " + train_dir + 545 | " when starting a new model. Please delete it manually and" + 546 | " try again.", task_as_string(self.task)) 547 | 548 | def get_meta_filename(self, start_new_model, train_dir): 549 | if start_new_model: 550 | logging.info("%s: Flag 'start_new_model' is set. Building a new model.", 551 | task_as_string(self.task)) 552 | return None 553 | 554 | latest_checkpoint = tf.train.latest_checkpoint(train_dir) 555 | if not latest_checkpoint: 556 | logging.info("%s: No checkpoint file found. 
Building a new model.", 557 | task_as_string(self.task)) 558 | return None 559 | 560 | meta_filename = latest_checkpoint + ".meta" 561 | if not gfile.Exists(meta_filename): 562 | logging.info("%s: No meta graph file found. Building a new model.", 563 | task_as_string(self.task)) 564 | return None 565 | else: 566 | return meta_filename 567 | 568 | def recover_model(self, meta_filename): 569 | logging.info("%s: Restoring from meta graph file %s", 570 | task_as_string(self.task), meta_filename) 571 | return tf.train.import_meta_graph(meta_filename) 572 | 573 | def build_model(self, model, reader): 574 | """Find the model and build the graph.""" 575 | 576 | label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])() 577 | optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train]) 578 | 579 | build_graph(reader=reader, 580 | model=model, 581 | optimizer_class=optimizer_class, 582 | clip_gradient_norm=FLAGS.clip_gradient_norm, 583 | train_data_pattern=FLAGS.train_data_pattern, 584 | label_loss_fn=label_loss_fn, 585 | base_learning_rate=FLAGS.base_learning_rate, 586 | learning_rate_decay=FLAGS.learning_rate_decay, 587 | learning_rate_decay_examples=FLAGS.learning_rate_decay_examples, 588 | regularization_penalty=FLAGS.regularization_penalty, 589 | num_readers=FLAGS.num_readers, 590 | batch_size=FLAGS.batch_size, 591 | num_epochs=FLAGS.num_epochs) 592 | 593 | return tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=1.0) 594 | 595 | 596 | def get_reader(): 597 | # Convert feature_names and feature_sizes to lists of values. 
def get_reader():
  """Builds the feature reader selected by the command-line flags."""
  # Convert feature_names and feature_sizes from flag strings into lists.
  feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
      FLAGS.feature_names, FLAGS.feature_sizes)

  reader_cls = (readers.YT8MFrameFeatureReader if FLAGS.frame_features
                else readers.YT8MAggregatedFeatureReader)
  return reader_cls(feature_names=feature_names, feature_sizes=feature_sizes)


class ParameterServer(object):
  """A parameter server to serve variables in a distributed execution."""

  def __init__(self, cluster, task):
    """Creates a ParameterServer.

    Args:
      cluster: A tf.train.ClusterSpec if the execution is distributed.
        None otherwise.
      task: A TaskSpec describing the job type and the task index.
    """
    self.cluster = cluster
    self.task = task

  def run(self):
    """Starts the parameter server and blocks until it is shut down."""
    logging.info("%s: Starting parameter server within cluster %s.",
                 task_as_string(self.task), self.cluster.as_dict())
    start_server(self.cluster, self.task).join()


def start_server(cluster, task):
  """Creates and starts a grpc server for this task.

  Args:
    cluster: A tf.train.ClusterSpec if the execution is distributed.
      None otherwise.
    task: A TaskSpec describing the job type and the task index.

  Raises:
    ValueError: if the task type or index is missing.
  """
  if not task.type:
    raise ValueError("%s: The task type must be specified." %
                     task_as_string(task))
  if task.index is None:
    raise ValueError("%s: The task index must be specified." %
                     task_as_string(task))

  return tf.train.Server(
      tf.train.ClusterSpec(cluster),
      protocol="grpc",
      job_name=task.type,
      task_index=task.index)


def task_as_string(task):
  """Renders a task as the canonical "/job:<type>/task:<index>" string."""
  return "/job:%s/task:%s" % (task.type, task.index)


def main(unused_argv):
  """Entry point: dispatches to a trainer or a parameter server."""
  # The cluster and task descriptions come from the TF_CONFIG environment
  # variable, which is how Cloud ML / distributed TF hands out roles.
  env = json.loads(os.environ.get("TF_CONFIG", "{}"))

  cluster_data = env.get("cluster", None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None

  # Fall back to a single-machine "master" role when no task is provided.
  task_data = env.get("task", None) or {"type": "master", "index": 0}
  task = type("TaskSpec", (object,), task_data)

  logging.set_verbosity(tf.logging.INFO)
  logging.info("%s: Tensorflow version: %s.",
               task_as_string(task), tf.__version__)

  if not cluster or task.type in ("master", "worker"):
    model = find_class_by_name(FLAGS.model,
                               [frame_level_models, video_level_models])()
    reader = get_reader()
    model_exporter = export_model.ModelExporter(
        frame_features=FLAGS.frame_features,
        model=model,
        reader=reader)
    Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter,
            FLAGS.log_device_placement, FLAGS.max_steps,
            FLAGS.export_model_steps).run(start_new_model=FLAGS.start_new_model)
  elif task.type == "ps":
    ParameterServer(cluster, task).run()
  else:
    raise ValueError("%s: Invalid task_type: %s." %
                     (task_as_string(task), task.type))


if __name__ == "__main__":
  app.run()
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  """Dequantize the feature from the byte format to the float format.

  Args:
    feat_vector: the input 1-d vector.
    max_quantized_value: the maximum of the quantized value.
    min_quantized_value: the minimum of the quantized value.

  Returns:
    A float vector which has the same shape as feat_vector.
  """
  assert max_quantized_value > min_quantized_value
  quantized_range = max_quantized_value - min_quantized_value
  # 255 quantization steps span the range; the +range/512 bias recenters a
  # value to the middle of its quantization bucket.
  scalar = quantized_range / 255.0
  bias = (quantized_range / 512.0) + min_quantized_value
  return feat_vector * scalar + bias


def MakeSummary(name, value):
  """Creates a tf.Summary proto with the given name and value."""
  summary = tf.Summary()
  val = summary.value.add()
  val.tag = str(name)
  val.simple_value = float(value)
  return summary


def AddGlobalStepSummary(summary_writer,
                         global_step_val,
                         global_step_info_dict,
                         summary_scope="Eval"):
  """Add the global_step summary to the Tensorboard.

  Args:
    summary_writer: Tensorflow summary_writer.
    global_step_val: a int value of the global step.
    global_step_info_dict: a dictionary of the evaluation metrics calculated
      for a mini-batch.
    summary_scope: Train or Eval.

  Returns:
    A string of this global_step summary
  """
  this_hit_at_one = global_step_info_dict["hit_at_one"]
  this_perr = global_step_info_dict["perr"]
  this_loss = global_step_info_dict["loss"]
  # -1 marks "not measured"; the summary for it is skipped below.
  examples_per_second = global_step_info_dict.get("examples_per_second", -1)

  summary_writer.add_summary(
      MakeSummary("GlobalStep/" + summary_scope + "_Hit@1", this_hit_at_one),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("GlobalStep/" + summary_scope + "_Perr", this_perr),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("GlobalStep/" + summary_scope + "_Loss", this_loss),
      global_step_val)

  if examples_per_second != -1:
    summary_writer.add_summary(
        MakeSummary("GlobalStep/" + summary_scope + "_Example_Second",
                    examples_per_second), global_step_val)

  summary_writer.flush()
  info = ("global_step {0} | Batch Hit@1: {1:.3f} | Batch PERR: {2:.3f} "
          "| Batch Loss: {3:.3f} | Examples_per_sec: {4:.3f}").format(
              global_step_val, this_hit_at_one, this_perr, this_loss,
              examples_per_second)
  return info


def AddEpochSummary(summary_writer,
                    global_step_val,
                    epoch_info_dict,
                    summary_scope="Eval"):
  """Add the epoch summary to the Tensorboard.

  Args:
    summary_writer: TensorFlow summary_writer.
    global_step_val: a int value of the global step.
    epoch_info_dict: a dictionary of the evaluation metrics calculated for the
      whole epoch.
    summary_scope: Train or Eval.

  Returns:
    A string of this global_step summary
  """
  epoch_id = epoch_info_dict["epoch_id"]
  avg_hit_at_one = epoch_info_dict["avg_hit_at_one"]
  avg_perr = epoch_info_dict["avg_perr"]
  avg_loss = epoch_info_dict["avg_loss"]
  aps = epoch_info_dict["aps"]
  gap = epoch_info_dict["gap"]
  mean_ap = numpy.mean(aps)

  summary_writer.add_summary(
      MakeSummary("Epoch/" + summary_scope + "_Avg_Hit@1", avg_hit_at_one),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("Epoch/" + summary_scope + "_Avg_Perr", avg_perr),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("Epoch/" + summary_scope + "_Avg_Loss", avg_loss),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("Epoch/" + summary_scope + "_MAP", mean_ap),
      global_step_val)
  summary_writer.add_summary(
      MakeSummary("Epoch/" + summary_scope + "_GAP", gap),
      global_step_val)
  summary_writer.flush()

  # Bug fix: the Avg_Loss placeholder was "{5:3f}" (minimum field width 3,
  # default precision), not "{5:.3f}" (3 decimal places) like every other
  # metric in this message.
  info = ("epoch/eval number {0} | Avg_Hit@1: {1:.3f} | Avg_PERR: {2:.3f} "
          "| MAP: {3:.3f} | GAP: {4:.3f} | Avg_Loss: {5:.3f}").format(
              epoch_id, avg_hit_at_one, avg_perr, mean_ap, gap, avg_loss)
  return info
def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
  """Extract the list of feature names and the dimensionality of each feature
  from string of comma separated values.

  Args:
    feature_names: string containing comma separated list of feature names
    feature_sizes: string containing comma separated list of feature sizes

  Returns:
    List of the feature names and list of the dimensionality of each feature.
    Elements in the first/second list are strings/integers.
  """
  # Bug fix (readability/safety): the original comprehensions reused the
  # parameter names as their loop variables, shadowing the inputs.
  list_of_feature_names = [name.strip() for name in feature_names.split(',')]
  list_of_feature_sizes = [int(size) for size in feature_sizes.split(',')]
  if len(list_of_feature_names) != len(list_of_feature_sizes):
    # Kept as a logged error (not a raise) to preserve the original
    # best-effort behavior for callers.
    logging.error("length of the feature names (=%d) != length of feature "
                  "sizes (=%d)", len(list_of_feature_names),
                  len(list_of_feature_sizes))

  return list_of_feature_names, list_of_feature_sizes


def clip_gradient_norms(gradients_to_variables, max_norm):
  """Clips the gradients by the given value.

  Args:
    gradients_to_variables: A list of gradient to variable pairs (tuples).
    max_norm: the maximum norm value.

  Returns:
    A list of clipped gradient to variable pairs.
  """
  clipped_grads_and_vars = []
  for grad, var in gradients_to_variables:
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        # Sparse gradient: clip only the values, keep the index structure.
        tmp = tf.clip_by_norm(grad.values, max_norm)
        grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
      else:
        grad = tf.clip_by_norm(grad, max_norm)
    clipped_grads_and_vars.append((grad, var))
  return clipped_grads_and_vars


def combine_gradients(tower_grads):
  """Calculate the combined gradient for each shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    tower_grads: List of lists of (gradient, variable) tuples. The outer list
      is over individual gradients. The inner list is over the gradient
      calculation for each tower.
  Returns:
    List of pairs of (gradient, variable) where the gradient has been summed
    across all towers.
  """
  # Drop entries whose gradient is None (variables unused by a tower).
  filtered_grads = [[gv for gv in grad_list if gv[0] is not None]
                    for grad_list in tower_grads]
  final_grads = []
  # Assumes every tower yields the same variables in the same order, as the
  # original implementation did.
  for i in range(len(filtered_grads[0])):
    grads = [tower[i] for tower in filtered_grads]
    summed = tf.reduce_sum(tf.stack([g for g, _ in grads], 0), 0)
    final_grads.append((summed, filtered_grads[0][i][1]))

  return final_grads
"""Contains model definitions."""
# noinspection PyUnresolvedReferences
import pathmagic
from tensorflow import flags
import attention_modules
import tensorflow as tf
import tensorflow.contrib.slim as slim
import models
import math

FLAGS = flags.FLAGS
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")


###############################################################################
# Baseline (Benchmark) models #################################################
###############################################################################
flags.DEFINE_float(
    "moe_l2", 1e-8,
    "L2 penalty for MoeModel.")
flags.DEFINE_integer(
    "moe_low_rank_gating", -1,
    "Low rank gating for MoeModel.")
flags.DEFINE_bool(
    "moe_prob_gating", False,
    "Prob gating for MoeModel.")
flags.DEFINE_string(
    "moe_prob_gating_input", "prob",
    "input Prob gating for MoeModel.")


class MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    The model is a per-class softmax over a configurable number of logistic
    classifiers; one classifier in the mixture is a dummy that always
    predicts 0. Optionally, the resulting probabilities are gated.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    # Flag values override the defaults supplied by the caller.
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = FLAGS.gating_remove_diag

    if low_rank_gating != -1:
      # Factorized gating: project through a low-rank bottleneck first.
      bottleneck = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          bottleneck,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")
    else:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    # (Batch * #Labels) x (num_mixtures + 1) softmax over mixtures, and
    # (Batch * #Labels) x num_mixtures per-expert sigmoids.
    gating_distribution = tf.nn.softmax(
        tf.reshape(gate_activations, [-1, num_mixtures + 1]))
    expert_distribution = tf.nn.sigmoid(
        tf.reshape(expert_activations, [-1, num_mixtures]))

    # The dummy expert's weight (last gating column) is simply dropped.
    weighted = gating_distribution[:, :num_mixtures] * expert_distribution
    probabilities = tf.reshape(tf.reduce_sum(weighted, 1), [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        gates = gates - tf.multiply(
            tf.matrix_diag_part(gating_weights), probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")

      probabilities = tf.multiply(probabilities, tf.sigmoid(gates))

    return {"predictions": probabilities}
class FishMoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   filter_size=2,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model refined by a FishGate.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
      filter_size: expansion factor handed to the FishGate module.
    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    l2_penalty = FLAGS.moe_l2

    gate_activations = slim.fully_connected(
        model_input,
        vocab_size * (num_mixtures + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])
    probabilities = tf.layers.batch_normalization(probabilities,
                                                  training=is_training)

    # NOTE(review): `fish_modules` is not imported in this file — presumably a
    # project-local module; confirm the import before using this model.
    # Bug fix: k was hard-coded to 2, silently ignoring the `filter_size`
    # parameter (whose default is 2, so behavior is unchanged for existing
    # callers); FishMoeModel2 already passes filter_size here.
    fish_gate = fish_modules.FishGate(hidden_size=vocab_size,
                                      k=filter_size,
                                      dropout_rate=0.9,
                                      is_training=is_training)

    probabilities = fish_gate.forward(probabilities)
    probabilities = tf.contrib.layers.layer_norm(probabilities)

    probabilities = tf.layers.dense(probabilities, vocab_size, use_bias=True,
                                    activation=tf.nn.softmax)

    return {"predictions": probabilities}
class FishMoeModel2(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   filter_size=2,
                   **unused_params):
    """Mixture-of-experts head whose output is refined by a FishGate module.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
      filter_size: expansion factor handed to the FishGate module.
    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key (batch_size x num_classes).
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    l2_penalty = FLAGS.moe_l2

    gate_logits = slim.fully_connected(
        model_input,
        vocab_size * (num_mixtures + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates")
    expert_logits = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    # Softmax over mixtures (plus the dummy expert) and per-expert sigmoids,
    # both flattened to (Batch * #Labels) rows.
    mixture_weights = tf.nn.softmax(
        tf.reshape(gate_logits, [-1, num_mixtures + 1]))
    expert_probs = tf.nn.sigmoid(
        tf.reshape(expert_logits, [-1, num_mixtures]))

    mixed = tf.reduce_sum(
        mixture_weights[:, :num_mixtures] * expert_probs, 1)
    probabilities = tf.reshape(mixed, [-1, vocab_size])

    # NOTE(review): `fish_modules` is not imported in this file — presumably a
    # project-local module; confirm the import before using this model.
    fish_gate = fish_modules.FishGate(hidden_size=vocab_size,
                                      k=filter_size,
                                      dropout_rate=0.8,
                                      is_training=is_training)
    probabilities = fish_gate.forward(probabilities)

    return {"predictions": probabilities}
class FishMoeModel4(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   filter_size=2,
                   **unused_params):
    """Three dense blocks followed by a FishGate refinement.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
      filter_size: expansion factor handed to the FishGate module.
    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key (batch_size x num_classes).
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    l2_penalty = FLAGS.moe_l2

    # Three identical dense blocks (ReLU, ReLU, then sigmoid), each followed
    # by batch norm and — during training — dropout keeping 90% of units.
    hidden = model_input
    for block_activation in (tf.nn.relu, tf.nn.relu, tf.nn.sigmoid):
      hidden = tf.layers.dense(
          hidden, vocab_size, activation=block_activation,
          kernel_regularizer=slim.l2_regularizer(l2_penalty))
      hidden = tf.layers.batch_normalization(hidden, training=is_training)
      if is_training:
        hidden = tf.nn.dropout(hidden, keep_prob=0.9)

    # NOTE(review): `fish_modules` is not imported in this file — presumably a
    # project-local module; confirm the import before using this model.
    fish_gate = fish_modules.FishGate(hidden_size=vocab_size,
                                      k=filter_size,
                                      dropout_rate=0.9,
                                      is_training=is_training)
    probabilities = fish_gate.forward(hidden)

    return {"predictions": probabilities}
class FishMoeModel3(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-6,
                   filter_size=2,
                   **unused_params):
    """MoE head followed by a residual refinement block over probabilities.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
      filter_size: width multiplier of the residual hidden layer.
    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key (batch_size x num_classes).
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    l2_penalty = FLAGS.moe_l2

    gate_activations = slim.fully_connected(
        model_input,
        vocab_size * (num_mixtures + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities0 = tf.reshape(probabilities_by_class_and_batch,
                                [-1, vocab_size])
    probabilities0 = tf.layers.batch_normalization(probabilities0,
                                                   training=is_training)

    # Residual refinement: expand, BN, dropout, project back.
    r_activation0 = tf.layers.dense(probabilities0, vocab_size * filter_size,
                                    use_bias=True, activation=tf.nn.relu)
    r_activation0 = tf.layers.batch_normalization(r_activation0,
                                                  training=is_training)
    # Bug fix: the original called `tf.layers.dropout(r_activation0, 0.9)`
    # inside `if is_training:` but without `training=True`; tf.layers.dropout
    # defaults to inference mode, so the call was a no-op. Also, 0.9 here is
    # the DROP rate, while sibling models keep 90% of units
    # (tf.nn.dropout keep_prob=0.9) — use rate=0.1 to match that intent.
    r_activation0 = tf.layers.dropout(r_activation0, rate=0.1,
                                      training=is_training)
    r_activation1 = tf.layers.dense(r_activation0, vocab_size,
                                    use_bias=True, activation=None)

    probabilities1 = probabilities0 + r_activation1
    probabilities1 = tf.contrib.layers.layer_norm(probabilities1)
    probabilities1 = tf.layers.batch_normalization(probabilities1,
                                                   training=is_training)
    probabilities2 = tf.layers.dense(probabilities1, vocab_size,
                                     use_bias=True, activation=tf.nn.softmax)

    return {"predictions": probabilities2}
class MoeModel2(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self,
                     model_input,
                     vocab_size,
                     is_training,
                     num_mixtures=None,
                     l2_penalty=1e-8,
                     **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        It also includes the possibility of gating the probabilities.
        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in
        the mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity). Defaults to 3.
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values. NOTE: overridden below by FLAGS.moe_l2.

        Returns:
          A dictionary with a tensor containing the probability predictions of
          the model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        # BUG FIX: the num_mixtures argument was silently ignored (hard-coded
        # to 3). Honor it when supplied; keep 3 as the default.
        num_mixtures = num_mixtures or 3
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2  # flag value deliberately wins over the arg
        # NOTE(review): these two flags are read but never used in this body —
        # presumably left over from a gated variant; confirm before removing.
        gating_probabilities = FLAGS.moe_prob_gating
        gating_input = FLAGS.moe_prob_gating_input

        if low_rank_gating == -1:
            # Full-rank gating: one projection straight to the gate logits.
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            # Low-rank gating: factor the gate projection through a bottleneck.
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(tf.reshape(
            gate_activations,
            [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(tf.reshape(
            expert_activations,
            [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

        # Drop the dummy expert's gate (last column) when mixing.
        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])

        # Residual refinement head: widen, ReLU, BN, dropout, project back.
        filter1 = tf.layers.dense(probabilities,
                                  vocab_size * 2,
                                  use_bias=True,
                                  activation=tf.nn.relu,
                                  name="v-filter1")
        filter1 = tf.layers.batch_normalization(filter1, training=is_training)

        if is_training:
            filter1 = tf.nn.dropout(filter1, 0.8)

        filter2 = tf.layers.dense(filter1,
                                  vocab_size,
                                  use_bias=False,
                                  activation=None,
                                  name="v-filter2")

        probabilities = probabilities + filter2
        probabilities = tf.nn.relu(probabilities)
        probabilities = tf.layers.batch_normalization(probabilities,
                                                      training=is_training)

        probabilities = tf.layers.dense(probabilities, vocab_size,
                                        use_bias=True,
                                        activation=tf.nn.sigmoid,
                                        name="v-final_output")

        return {"predictions": probabilities}
class JuhanMoeModel(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self,
                     model_input,
                     vocab_size,
                     is_training,
                     num_mixtures=None,
                     l2_penalty=1e-8,
                     **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in
        the mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity). Defaults to 3.
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.

        Returns:
          A dictionary with a tensor containing the probability predictions of
          the model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        # BUG FIX: the num_mixtures argument was silently ignored (hard-coded
        # to 3). Honor it when supplied; keep 3 as the default.
        num_mixtures = num_mixtures or 3

        gate_activations = slim.fully_connected(
            model_input,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates")
        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(tf.reshape(
            gate_activations,
            [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(tf.reshape(
            expert_activations,
            [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

        # Drop the dummy expert's gate (last column) when mixing.
        final_probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                   [-1, vocab_size])
        if is_training:
            probabilities = tf.nn.dropout(probabilities, 0.8)

        # Residual refinement head: widen, leaky-ReLU, BN, dropout, project.
        filter1 = tf.layers.dense(probabilities,
                                  vocab_size * 2,
                                  use_bias=True,
                                  activation=tf.nn.leaky_relu,
                                  name="v-filter1")
        filter1 = tf.layers.batch_normalization(filter1, training=is_training)
        if is_training:
            filter1 = tf.nn.dropout(filter1, 0.8)

        filter2 = tf.layers.dense(filter1,
                                  vocab_size,
                                  use_bias=False,
                                  activation=None,
                                  name="v-filter2")

        probabilities = probabilities + filter2
        probabilities = tf.nn.leaky_relu(probabilities)
        probabilities = tf.layers.batch_normalization(probabilities,
                                                      training=is_training)

        probabilities = tf.layers.dense(probabilities, vocab_size,
                                        use_bias=True,
                                        activation=tf.nn.sigmoid,
                                        name="v-final_output")

        return {"predictions": probabilities}
class FourLayerBatchNeuralModel(models.BaseModel):
    """Four stacked fully-connected layers with batch norm.

    The first three layers are FC -> ReLU -> batch norm (bias-free); the
    fourth adds a bias and applies a sigmoid to produce per-class
    probabilities.
    """

    def _fc_relu_bn(self, inputs, in_dim, out_dim, name, is_training):
        """One FC -> ReLU -> batch-norm stanza.

        Variable names, summary tags, and batch-norm scopes exactly match the
        original inline code (``<name>_weights`` / ``<name>_activation_bn``)
        so existing checkpoints keep loading.
        """
        weights = tf.get_variable(name + "_weights",
                                  [in_dim, out_dim],
                                  initializer=tf.contrib.layers.xavier_initializer())
        tf.summary.histogram(name + "_weights", weights)
        activation = tf.nn.relu(tf.matmul(inputs, weights))
        return slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope=name + "_activation_bn")

    def create_model(self,
                     model_input,
                     vocab_size,
                     is_training,
                     l2_penalty=1e-7,
                     **unused_params):
        """Builds the four-layer classifier.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          l2_penalty: Accepted for interface compatibility but unused — no
            regularizer is attached to any of the layers.

        Returns:
          A dictionary with the 'predictions' tensor of shape
          batch_size x vocab_size (sigmoid probabilities).
        """
        model_input_dim = model_input.get_shape().as_list()[1]

        # Three identical hidden stanzas, refactored into a shared helper.
        fc1_activation = self._fc_relu_bn(model_input, model_input_dim,
                                          vocab_size, "fc1", is_training)
        fc2_activation = self._fc_relu_bn(fc1_activation, vocab_size,
                                          vocab_size, "fc2", is_training)
        fc3_activation = self._fc_relu_bn(fc2_activation, vocab_size,
                                          vocab_size, "fc3", is_training)

        # Output layer: linear projection + bias, then sigmoid (no BN).
        fc4_weights = tf.get_variable("fc4_weights",
                                      [vocab_size, vocab_size],
                                      initializer=tf.contrib.layers.xavier_initializer())
        fc4_activation = tf.matmul(fc3_activation, fc4_weights)
        cluster_biases = tf.get_variable("fc4_bias",
                                         [vocab_size],
                                         initializer=tf.constant_initializer(0.01))
        tf.summary.histogram("fc4_bias", cluster_biases)
        fc4_activation += cluster_biases

        fc4_activation = tf.sigmoid(fc4_activation)

        return {"predictions": fc4_activation}


class ClassLearningThreeNnModel(models.BaseModel):
    """Three FC layers with layer norm, leaky-ReLU, and dropout."""

    def create_model(self,
                     model_input,
                     vocab_size,
                     is_training,
                     l2_penalty=1e-8,
                     ortho_reg=0,
                     **unused_params):
        """Builds the three-layer classifier.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase? Enables dropout.
          l2_penalty: L2 weight-regularization strength for every layer.
          ortho_reg: Extra regularization term passed through unchanged as
            'regularization_loss'.

        Returns:
          A dictionary with 'predictions' (batch_size x vocab_size sigmoid
          probabilities) and 'regularization_loss'.
        """
        fc1 = slim.fully_connected(
            model_input, vocab_size, activation_fn=None, biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty))
        fc1 = tf.contrib.layers.layer_norm(inputs=fc1, center=True, scale=True,
                                           activation_fn=tf.nn.leaky_relu)
        if is_training:
            fc1 = tf.nn.dropout(fc1, keep_prob=0.5)

        fc2 = slim.fully_connected(
            fc1, vocab_size, activation_fn=None, biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty))
        fc2 = tf.contrib.layers.layer_norm(inputs=fc2, center=True, scale=True,
                                           activation_fn=tf.nn.leaky_relu)
        if is_training:
            fc2 = tf.nn.dropout(fc2, keep_prob=0.5)

        # Output layer: sigmoid head with a small positive bias init.
        fc3 = slim.fully_connected(
            fc2, vocab_size, activation_fn=tf.nn.sigmoid,
            biases_initializer=tf.constant_initializer(0.1),
            weights_regularizer=slim.l2_regularizer(l2_penalty))

        return {"predictions": fc3,
                "regularization_loss": ortho_reg}
class ClassLearningFourNnModel(models.BaseModel):
    """Four FC layers: three layer-normed leaky-ReLU blocks plus a sigmoid head."""

    def create_model(self,
                     model_input,
                     vocab_size,
                     is_training,
                     l2_penalty=1e-8,
                     ortho_reg=0,
                     **unused_params):
        """Builds the four-layer classifier.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase? (Unused here — no dropout.)
          l2_penalty: L2 weight-regularization strength for every layer.
          ortho_reg: Extra regularization term passed through unchanged as
            'regularization_loss'.

        Returns:
          A dictionary with 'predictions' (batch_size x vocab_size sigmoid
          probabilities) and 'regularization_loss'.
        """
        # Three identical hidden blocks: bias-free linear projection followed
        # by layer norm with a leaky-ReLU activation. slim auto-names the
        # layers sequentially, so the loop keeps the original variable names.
        net = model_input
        for _ in range(3):
            net = slim.fully_connected(
                net, vocab_size, activation_fn=None, biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty))
            net = tf.contrib.layers.layer_norm(
                inputs=net, center=True, scale=True,
                activation_fn=tf.nn.leaky_relu)

        # Output layer: sigmoid head with a small positive bias init.
        predictions = slim.fully_connected(
            net, vocab_size, activation_fn=tf.nn.sigmoid,
            biases_initializer=tf.constant_initializer(0.1),
            weights_regularizer=slim.l2_regularizer(l2_penalty))

        return {"predictions": predictions,
                "regularization_loss": ortho_reg}