├── LICENSE ├── README.md ├── archs ├── __init__.py ├── bn_inception.py └── mobilenet_v2.py ├── main.py ├── ops ├── PAN_modules.py ├── __init__.py ├── basic_ops.py ├── dataset.py ├── dataset_config.py ├── models.py ├── non_local.py ├── temporal_shift.py ├── transforms.py └── utils.py ├── opts.py ├── pretrained └── models_urls.md ├── scripts ├── test │ ├── sthv1 │ │ ├── En.sh │ │ ├── Full.sh │ │ └── Lite.sh │ └── sthv2 │ │ ├── En.sh │ │ ├── Full.sh │ │ └── Lite.sh └── train │ ├── sthv1 │ ├── Full_PA.sh │ ├── Full_RGB.sh │ └── Lite.sh │ └── sthv2 │ ├── Full_PA.sh │ ├── Full_RGB.sh │ └── Lite.sh ├── test_models.py └── tools ├── gen_label_kinetics.py ├── gen_label_sthv1.py ├── gen_label_sthv2.py ├── kinetics_label_map.txt ├── vid2img_kinetics.py └── vid2img_sthv2.py /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright MIT HAN Lab 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PAN: Persistent Appearance Network 2 | 3 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pan-towards-fast-action-recognition-via/action-recognition-in-videos-on-something-1)](https://paperswithcode.com/sota/action-recognition-in-videos-on-something-1?p=pan-towards-fast-action-recognition-via) 4 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pan-towards-fast-action-recognition-via/action-recognition-in-videos-on-something)](https://paperswithcode.com/sota/action-recognition-in-videos-on-something?p=pan-towards-fast-action-recognition-via) 5 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pan-towards-fast-action-recognition-via/action-recognition-in-videos-on-jester)](https://paperswithcode.com/sota/action-recognition-in-videos-on-jester?p=pan-towards-fast-action-recognition-via) 6 | 7 | PyTorch Implementation of paper: 8 | 9 | > **PAN: Towards Fast Action Recognition via Learning Persistence of Appearance** 10 | > 11 | > Can Zhang, Yuexian Zou\*, Guang Chen and Lei Gan. 12 | > 13 | > [[ArXiv](https://arxiv.org/abs/2008.03462)] 14 | 15 | ## Updates 16 | 17 | **[12 Aug 2020]** We have released the codebase and models of the PAN. 18 | 19 | ## Main Contribution 20 | 21 | Efficiently modeling dynamic motion information in videos is crucial for action recognition task. Most state-of-the-art methods heavily rely on dense optical flow as motion representation. Although combining optical flow with RGB frames as input can achieve excellent recognition performance, the optical flow extraction is very time-consuming. This undoubtably will count against real-time action recognition. In this paper, we shed light on **fast action recognition** by lifting the reliance on optical flow. We design a novel **motion cue** called **Persistence of Appearance (PA)** that focuses more on distilling the motion information at boundaries. Extensive experiments show that our PA is over 1000x faster (8196fps *vs.* 8fps) than conventional optical flow in terms of motion modeling speed. 22 | 23 |


26 | 27 | ## Content 28 | 29 | - [Dependencies](#dependencies) 30 | - [Data Preparation](#data-preparation) 31 | - [Core Codes](#core-codes) 32 | - [PA Module](#pa-module) 33 | - [VAP Module](#vap-module) 34 | - [Pretrained Models](#pretrained-models) 35 | + [Something-Something-V1](#something-something-v1) 36 | + [Something-Something-V2](#something-something-v2) 37 | - [Testing](#testing) 38 | - [Training](#training) 39 | - [Other Info](#other-info) 40 | - [References](#references) 41 | - [Citation](#citation) 42 | - [Contact](#contact) 43 | 44 | ## Dependencies 45 | 46 | Please make sure the following libraries are installed successfully: 47 | 48 | - [PyTorch](https://pytorch.org/) >= 1.0 49 | - [TensorboardX](https://github.com/lanpa/tensorboardX) 50 | - [tqdm](https://github.com/tqdm/tqdm.git) 51 | - [scikit-learn](https://scikit-learn.org/stable/) 52 | 53 | ## Data Preparation 54 | 55 | Following the common practice, we need to first extract videos into frames for fast reading. Please refer to [TSN](https://github.com/yjxiong/temporal-segment-networks) repo for the detailed guide of data pre-processing. We have successfully trained on [Kinetics](https://deepmind.com/research/open-source/open-source-datasets/kinetics/), [UCF101](http://crcv.ucf.edu/data/UCF101.php), [HMDB51](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/), [Something-Something-V1](https://20bn.com/datasets/something-something/v1) and [V2](https://20bn.com/datasets/something-something/v2), [Jester](https://20bn.com/datasets/jester) datasets with this codebase. Basically, the processing of video data can be summarized into 3 steps: 56 | 57 | 1. Extract frames from videos: 58 | 59 | * For Something-Something-V2 dataset, please use [tools/vid2img_sthv2.py](tools/vid2img_sthv2.py) 60 | 61 | * For Kinetics dataset, please use [tools/vid2img_kinetics.py](tools/vid2img_kinetics.py) 62 | 63 | 2. Generate file lists needed for dataloader: 64 | 65 | * Each line of the list file will contain a tuple of (*extracted video frame folder name, video frame number, and video groundtruth class*). A list file looks like this: 66 | 67 | ``` 68 | video_frame_folder 100 10 69 | video_2_frame_folder 150 31 70 | ... 71 | ``` 72 | 73 | * Or you can use off-the-shelf tools provided by other repos: 74 | * For Something-Something-V1 & V2 datasets, please use [tools/gen_label_sthv1.py](tools/gen_label_sthv1.py) & [tools/gen_label_sthv2.py](tools/gen_label_sthv2.py) 75 | * For Kinetics dataset, please use [tools/gen_label_kinetics.py](tools/gen_label_kinetics.py) 76 | 77 | 3. Add the information to [ops/dataset_configs.py](ops/dataset_configs.py) 78 | 79 | ## Core Codes 80 | 81 | ### PA Module 82 | 83 |


The PA module aims to speed up the motion modeling procedure; it can simply be injected at the bottom of the network to lift the reliance on optical flow.

```python
import torch
from ops.PAN_modules import PA

PA_module = PA(n_length=4)  # adjacent '4' frames are sampled for computing PA
# shape of x: [N*T*m, 3, H, W]
x = torch.randn(5*8*4, 3, 224, 224)
# shape of PA_out: [N*T, m-1, H, W]
PA_out = PA_module(x)  # torch.Size([40, 3, 224, 224])
```

### VAP Module

The VAP module aims to adaptively emphasize expressive features and suppress less informative ones by observing global information across various timescales. It is adopted at the top of the network to achieve long-term temporal modeling.
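To see how the two modules fit together, here is a minimal end-to-end shape sketch; the standalone VAP usage is shown right below it. The `backbone` here is a hypothetical stand-in for the 2D CNN trunk (e.g. a ResNet-50 with its classifier removed) and is used only to illustrate the tensor shapes, not the actual PAN architecture:

```python
import torch
import torch.nn as nn
from ops.PAN_modules import PA, VAP

N, T, m = 5, 8, 4                      # videos per batch, segments per video, frames per segment
frames = torch.randn(N * T * m, 3, 224, 224)

PA_module = PA(n_length=m)
pa_maps = PA_module(frames)            # [N*T, m-1, 224, 224]

# Hypothetical stand-in for the CNN trunk: maps [*, m-1, 224, 224] -> [*, 2048]
backbone = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(m - 1, 2048))
features = backbone(pa_maps)           # [N*T, 2048]

VAP_module = VAP(n_segment=T, feature_dim=2048, num_class=174, dropout_ratio=0.5)
scores = VAP_module(features)          # [N, 174]
```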


```python
import torch
from ops.PAN_modules import VAP

VAP_module = VAP(n_segment=8, feature_dim=2048, num_class=174, dropout_ratio=0.5)
# shape of x: [N*T, D]
x = torch.randn(5*8, 2048)
# shape of VAP_out: [N, num_class]
VAP_out = VAP_module(x)  # torch.Size([5, 174])
```

## Pretrained Models

Here, we provide pretrained PAN models on the Something-Something-V1 & V2 datasets. Recognizing actions in these datasets requires strong temporal modeling ability, as many action classes are symmetrical. PAN achieves state-of-the-art performance on these datasets; notably, it even surpasses optical-flow-based methods while taking only RGB frames as input.

### Something-Something-V1

| Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
| :---: | :---: | :---: | :---: | :---: | :---: |
| PAN<sub>Lite</sub> | ResNet-50 | 35.7G * 1 | 48.0 | 76.1 | [Google Drive] or [Weiyun] |
| PAN<sub>Full</sub> | ResNet-50 | 67.7G * 1 | 50.5 | 79.2 | [Google Drive] or [Weiyun] |
| PAN<sub>En</sub> | ResNet-50 | (46.6G+88.4G) * 2 | 53.4 | 81.1 | [Google Drive] or [Weiyun] |
| PAN<sub>En</sub> | ResNet-101 | (85.6G+166.1G) * 2 | 55.3 | 82.8 | [Google Drive] or [Weiyun] |

### Something-Something-V2

| Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
| :---: | :---: | :---: | :---: | :---: | :---: |
| PAN<sub>Lite</sub> | ResNet-50 | 35.7G * 1 | 60.8 | 86.7 | [Google Drive] or [Weiyun] |
| PAN<sub>Full</sub> | ResNet-50 | 67.7G * 1 | 63.8 | 88.6 | [Google Drive] or [Weiyun] |
| PAN<sub>En</sub> | ResNet-50 | (46.6G+88.4G) * 2 | 66.2 | 90.1 | [Google Drive] or [Weiyun] |
| PAN<sub>En</sub> | ResNet-101 | (85.6G+166.1G) * 2 | 66.5 | 90.6 | [Google Drive] or [Weiyun] |

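Before running the test scripts in the next section, it can be handy to sanity-check a downloaded checkpoint. The sketch below assumes the `.pth.tar` files follow the usual TSN/TSM-style layout (a dict with `epoch`, `best_prec1` and `state_dict` entries); the file name shown is hypothetical and the exact keys may differ:

```python
import torch

# Hypothetical file name; point this at the checkpoint you placed in ./pretrained
ckpt_path = 'pretrained/PAN_Lite_sthv1.pth.tar'

# Load on CPU so the inspection does not require a GPU
checkpoint = torch.load(ckpt_path, map_location='cpu')
print('top-level keys:', list(checkpoint.keys()))

# Assumed layout: model weights stored under 'state_dict'
state_dict = checkpoint.get('state_dict', checkpoint)
print('number of parameter tensors:', len(state_dict))
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
```

If the parameter names carry a `module.` prefix (left over from `nn.DataParallel`), strip it before loading the weights into a single-GPU model.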
214 | 215 | ## Testing 216 | 217 | For example, to test the PAN models on Something-Something-V1, you can first put the downloaded `.pth.tar` files into the "pretrained" folder and then run: 218 | 219 | ```bash 220 | # test PAN_Lite 221 | bash scripts/test/sthv1/Lite.sh 222 | 223 | # test PAN_Full 224 | bash scripts/test/sthv1/Full.sh 225 | 226 | # test PAN_En 227 | bash scripts/test/sthv1/En.sh 228 | ``` 229 | 230 | ## Training 231 | 232 | We provided several scripts to train PAN with this repo, please refer to "[scripts](scripts/)" folder for more details. For example, to train PAN on Something-Something-V1, you can run: 233 | 234 | ```bash 235 | # train PAN_Lite 236 | bash scripts/train/sthv1/Lite.sh 237 | 238 | # train PAN_Full RGB branch 239 | bash scripts/train/sthv1/Full_RGB.sh 240 | 241 | # train PAN_Full PA branch 242 | bash scripts/train/sthv1/Full_PA.sh 243 | ``` 244 | 245 | Notice that you should scale up the learning rate with batch size. For example, if you use a batch size of 256 you should set learning rate to 0.04. 246 | 247 | ## Other Info 248 | 249 | ### References 250 | 251 | This repository is built upon the following baseline implementations for the action recognition task. 252 | 253 | - [TSM](https://github.com/mit-han-lab/temporal-shift-module) 254 | - [TSN](https://github.com/yjxiong/tsn-pytorch) 255 | 256 | ### Citation 257 | 258 | Please **[★star]** this repo and **[cite]** the following arXiv paper if you feel our PAN useful to your research: 259 | 260 | ``` 261 | @misc{zhang2020pan, 262 | title={PAN: Towards Fast Action Recognition via Learning Persistence of Appearance}, 263 | author={Can Zhang and Yuexian Zou and Guang Chen and Lei Gan}, 264 | year={2020}, 265 | eprint={2008.03462}, 266 | archivePrefix={arXiv}, 267 | primaryClass={cs.CV} 268 | } 269 | ``` 270 | 271 | Or if you prefer "publication", you can cite our preliminary work on ACM MM 2019: 272 | 273 | ``` 274 | @inproceedings{zhang2019pan, 275 | title={PAN: Persistent Appearance Network with an Efficient Motion Cue for Fast Action Recognition}, 276 | author={Zhang, Can and Zou, Yuexian and Chen, Guang and Gan, Lei}, 277 | booktitle={Proceedings of the 27th ACM International Conference on Multimedia}, 278 | pages={500--509}, 279 | year={2019} 280 | } 281 | ``` 282 | 283 | ### Contact 284 | 285 | For any questions, please feel free to open an issue or contact: 286 | 287 | ``` 288 | Can Zhang: zhang.can.pku@gmail.com 289 | ``` 290 | -------------------------------------------------------------------------------- /archs/__init__.py: -------------------------------------------------------------------------------- 1 | from .bn_inception import * 2 | -------------------------------------------------------------------------------- /archs/bn_inception.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | import torch 3 | import torch.nn as nn 4 | import torch.utils.model_zoo as model_zoo 5 | import torch.nn.functional as F 6 | 7 | 8 | __all__ = ['BNInception', 'bninception'] 9 | 10 | pretrained_settings = { 11 | 'bninception': { 12 | 'imagenet': { 13 | 'url': 'https://www.dropbox.com/s/3cvod6kzwluijcw/BNInception-9baff57459f5a1744.pth?dl=1', 14 | 'input_space': 'BGR', 15 | 'input_size': 224, 16 | 'input_range': [0, 255], 17 | 'mean': [104, 117, 128], 18 | 'std': [1, 1, 1], 19 | 'num_classes': 1000 20 | }, 21 | 'kinetics': { 22 | 'url': 
'https://www.dropbox.com/s/gx4u7itoyygix0c/BNInceptionKinetics-47f0695e.pth?dl=1', 23 | 'input_space': 'BGR', 24 | 'input_size': 224, 25 | 'input_range': [0, 255], 26 | 'mean': [104, 117, 128], # [96.29023126, 103.16065604, 110.63666788] 27 | 'std': [1, 1, 1], # [40.02898126, 37.88248729, 38.7568578], 28 | 'num_classes': 400 29 | } 30 | }, 31 | } 32 | 33 | 34 | class BNInception(nn.Module): 35 | def __init__(self, num_classes=1000): 36 | super(BNInception, self).__init__() 37 | inplace = True 38 | self._build_features(inplace, num_classes) 39 | 40 | def forward(self, x): 41 | # if self.input_space == 'BGR': 42 | # assert len(x.size()) == 4 43 | # x = x[:, (2, 1, 0)] 44 | x = self.features(x) 45 | x = self.logits(x) 46 | return x 47 | 48 | def features(self, x): 49 | # stage1 50 | pool1_3x3_s2_out = self._temporal_forward_wrap(self._block_1, 0)(x) 51 | # stage2 52 | pool2_3x3_s2_out = self._temporal_forward_wrap(self._block_2, 1)(pool1_3x3_s2_out) 53 | 54 | # stage3 55 | inception_3a_output_out = self._temporal_forward_wrap(self._block_3a, 2)(pool2_3x3_s2_out) 56 | inception_3b_output_out = self._temporal_forward_wrap(self._block_3b, 3)(inception_3a_output_out) 57 | inception_3c_output_out = self._temporal_forward_wrap(self._block_3c, 4)(inception_3b_output_out) 58 | 59 | inception_4a_output_out = self._temporal_forward_wrap(self._block_4a, 5)(inception_3c_output_out) 60 | inception_4b_output_out = self._temporal_forward_wrap(self._block_4b, 6)(inception_4a_output_out) 61 | inception_4c_output_out = self._temporal_forward_wrap(self._block_4c, 7)(inception_4b_output_out) 62 | inception_4d_output_out = self._temporal_forward_wrap(self._block_4d, 8)(inception_4c_output_out) 63 | inception_4e_output_out = self._temporal_forward_wrap(self._block_4e, 9)(inception_4d_output_out) 64 | 65 | inception_5a_output_out = self._temporal_forward_wrap(self._block_5a, 10)(inception_4e_output_out) 66 | inception_5b_output_out = self._temporal_forward_wrap(self._block_5b, 11)(inception_5a_output_out) 67 | 68 | return inception_5b_output_out 69 | 70 | def logits(self, features): 71 | x = self.global_pool(features) 72 | x = x.view(x.size(0), -1) 73 | x = self.fc(x) 74 | return x 75 | 76 | def build_temporal_ops(self, n_segment, is_temporal_shift='0' * 12, shift_div=8): 77 | # must call after loading weights 78 | self.n_segment = n_segment 79 | self.residual = 'res' in is_temporal_shift 80 | if self.residual: 81 | print('=> Using residual shift functions...') 82 | if is_temporal_shift in ['block', 'blockres']: 83 | self.is_temporal_shift = '1' * 12 84 | else: 85 | self.is_temporal_shift = is_temporal_shift 86 | self.is_temporal_shift = '0' + self.is_temporal_shift[1:] # image input does not shift 87 | 88 | assert len(self.is_temporal_shift) == 12 89 | 90 | print('=> Injecting temporal shift with mask {}'.format(self.is_temporal_shift)) 91 | self.fold_div = shift_div 92 | print('=> Using fold div: {}'.format(self.fold_div)) 93 | 94 | def _temporal_forward_wrap(self, layer_func, index): 95 | if hasattr(self, 'is_temporal_shift') and self.is_temporal_shift[index] == '1': # run temporal shuffling 96 | from ops.temporal_shift import TemporalShift 97 | def wrapped_func(x, is_residual, n_segment, fold_div): 98 | if is_residual: 99 | x_shift = TemporalShift.shift(x, n_segment, fold_div=fold_div) 100 | return F.relu(x + layer_func(x_shift)) 101 | else: 102 | x = TemporalShift.shift(x, n_segment, fold_div=fold_div) 103 | return layer_func(x) 104 | from functools import partial 105 | return partial(wrapped_func, 
is_residual=self.residual, n_segment=self.n_segment, 106 | fold_div=self.fold_div) 107 | else: 108 | return layer_func 109 | 110 | def _block_1(self, x): 111 | conv1_7x7_s2_out = self.conv1_7x7_s2(x) 112 | conv1_7x7_s2_bn_out = self.conv1_7x7_s2_bn(conv1_7x7_s2_out) 113 | conv1_relu_7x7_out = self.conv1_relu_7x7(conv1_7x7_s2_bn_out) 114 | pool1_3x3_s2_out = self.pool1_3x3_s2(conv1_7x7_s2_bn_out) 115 | return pool1_3x3_s2_out 116 | 117 | def _block_2(self, x): 118 | conv2_3x3_reduce_out = self.conv2_3x3_reduce(x) 119 | conv2_3x3_reduce_bn_out = self.conv2_3x3_reduce_bn(conv2_3x3_reduce_out) 120 | conv2_relu_3x3_reduce_out = self.conv2_relu_3x3_reduce(conv2_3x3_reduce_bn_out) 121 | conv2_3x3_out = self.conv2_3x3(conv2_3x3_reduce_bn_out) 122 | conv2_3x3_bn_out = self.conv2_3x3_bn(conv2_3x3_out) 123 | conv2_relu_3x3_out = self.conv2_relu_3x3(conv2_3x3_bn_out) 124 | pool2_3x3_s2_out = self.pool2_3x3_s2(conv2_3x3_bn_out) 125 | return pool2_3x3_s2_out 126 | 127 | def _block_3a(self, pool2_3x3_s2_out): 128 | inception_3a_1x1_out = self.inception_3a_1x1(pool2_3x3_s2_out) 129 | inception_3a_1x1_bn_out = self.inception_3a_1x1_bn(inception_3a_1x1_out) 130 | inception_3a_relu_1x1_out = self.inception_3a_relu_1x1(inception_3a_1x1_bn_out) 131 | inception_3a_3x3_reduce_out = self.inception_3a_3x3_reduce(pool2_3x3_s2_out) 132 | inception_3a_3x3_reduce_bn_out = self.inception_3a_3x3_reduce_bn(inception_3a_3x3_reduce_out) 133 | inception_3a_relu_3x3_reduce_out = self.inception_3a_relu_3x3_reduce(inception_3a_3x3_reduce_bn_out) 134 | inception_3a_3x3_out = self.inception_3a_3x3(inception_3a_3x3_reduce_bn_out) 135 | inception_3a_3x3_bn_out = self.inception_3a_3x3_bn(inception_3a_3x3_out) 136 | inception_3a_relu_3x3_out = self.inception_3a_relu_3x3(inception_3a_3x3_bn_out) 137 | inception_3a_double_3x3_reduce_out = self.inception_3a_double_3x3_reduce(pool2_3x3_s2_out) 138 | inception_3a_double_3x3_reduce_bn_out = self.inception_3a_double_3x3_reduce_bn( 139 | inception_3a_double_3x3_reduce_out) 140 | inception_3a_relu_double_3x3_reduce_out = self.inception_3a_relu_double_3x3_reduce( 141 | inception_3a_double_3x3_reduce_bn_out) 142 | inception_3a_double_3x3_1_out = self.inception_3a_double_3x3_1(inception_3a_double_3x3_reduce_bn_out) 143 | inception_3a_double_3x3_1_bn_out = self.inception_3a_double_3x3_1_bn(inception_3a_double_3x3_1_out) 144 | inception_3a_relu_double_3x3_1_out = self.inception_3a_relu_double_3x3_1(inception_3a_double_3x3_1_bn_out) 145 | inception_3a_double_3x3_2_out = self.inception_3a_double_3x3_2(inception_3a_double_3x3_1_bn_out) 146 | inception_3a_double_3x3_2_bn_out = self.inception_3a_double_3x3_2_bn(inception_3a_double_3x3_2_out) 147 | inception_3a_relu_double_3x3_2_out = self.inception_3a_relu_double_3x3_2(inception_3a_double_3x3_2_bn_out) 148 | inception_3a_pool_out = self.inception_3a_pool(pool2_3x3_s2_out) 149 | inception_3a_pool_proj_out = self.inception_3a_pool_proj(inception_3a_pool_out) 150 | inception_3a_pool_proj_bn_out = self.inception_3a_pool_proj_bn(inception_3a_pool_proj_out) 151 | inception_3a_relu_pool_proj_out = self.inception_3a_relu_pool_proj(inception_3a_pool_proj_bn_out) 152 | inception_3a_output_out = torch.cat( 153 | [inception_3a_1x1_bn_out, inception_3a_3x3_bn_out, inception_3a_double_3x3_2_bn_out, 154 | inception_3a_pool_proj_bn_out], 1) 155 | return inception_3a_output_out 156 | 157 | def _block_3b(self, inception_3a_output_out): 158 | inception_3b_1x1_out = self.inception_3b_1x1(inception_3a_output_out) 159 | inception_3b_1x1_bn_out = 
self.inception_3b_1x1_bn(inception_3b_1x1_out) 160 | inception_3b_relu_1x1_out = self.inception_3b_relu_1x1(inception_3b_1x1_bn_out) 161 | inception_3b_3x3_reduce_out = self.inception_3b_3x3_reduce(inception_3a_output_out) 162 | inception_3b_3x3_reduce_bn_out = self.inception_3b_3x3_reduce_bn(inception_3b_3x3_reduce_out) 163 | inception_3b_relu_3x3_reduce_out = self.inception_3b_relu_3x3_reduce(inception_3b_3x3_reduce_bn_out) 164 | inception_3b_3x3_out = self.inception_3b_3x3(inception_3b_3x3_reduce_bn_out) 165 | inception_3b_3x3_bn_out = self.inception_3b_3x3_bn(inception_3b_3x3_out) 166 | inception_3b_relu_3x3_out = self.inception_3b_relu_3x3(inception_3b_3x3_bn_out) 167 | inception_3b_double_3x3_reduce_out = self.inception_3b_double_3x3_reduce(inception_3a_output_out) 168 | inception_3b_double_3x3_reduce_bn_out = self.inception_3b_double_3x3_reduce_bn( 169 | inception_3b_double_3x3_reduce_out) 170 | inception_3b_relu_double_3x3_reduce_out = self.inception_3b_relu_double_3x3_reduce( 171 | inception_3b_double_3x3_reduce_bn_out) 172 | inception_3b_double_3x3_1_out = self.inception_3b_double_3x3_1(inception_3b_double_3x3_reduce_bn_out) 173 | inception_3b_double_3x3_1_bn_out = self.inception_3b_double_3x3_1_bn(inception_3b_double_3x3_1_out) 174 | inception_3b_relu_double_3x3_1_out = self.inception_3b_relu_double_3x3_1(inception_3b_double_3x3_1_bn_out) 175 | inception_3b_double_3x3_2_out = self.inception_3b_double_3x3_2(inception_3b_double_3x3_1_bn_out) 176 | inception_3b_double_3x3_2_bn_out = self.inception_3b_double_3x3_2_bn(inception_3b_double_3x3_2_out) 177 | inception_3b_relu_double_3x3_2_out = self.inception_3b_relu_double_3x3_2(inception_3b_double_3x3_2_bn_out) 178 | inception_3b_pool_out = self.inception_3b_pool(inception_3a_output_out) 179 | inception_3b_pool_proj_out = self.inception_3b_pool_proj(inception_3b_pool_out) 180 | inception_3b_pool_proj_bn_out = self.inception_3b_pool_proj_bn(inception_3b_pool_proj_out) 181 | inception_3b_relu_pool_proj_out = self.inception_3b_relu_pool_proj(inception_3b_pool_proj_bn_out) 182 | inception_3b_output_out = torch.cat( 183 | [inception_3b_1x1_bn_out, inception_3b_3x3_bn_out, inception_3b_double_3x3_2_bn_out, 184 | inception_3b_pool_proj_bn_out], 1) 185 | return inception_3b_output_out 186 | 187 | def _block_3c(self, inception_3b_output_out): 188 | inception_3c_3x3_reduce_out = self.inception_3c_3x3_reduce(inception_3b_output_out) 189 | inception_3c_3x3_reduce_bn_out = self.inception_3c_3x3_reduce_bn(inception_3c_3x3_reduce_out) 190 | inception_3c_relu_3x3_reduce_out = self.inception_3c_relu_3x3_reduce(inception_3c_3x3_reduce_bn_out) 191 | inception_3c_3x3_out = self.inception_3c_3x3(inception_3c_3x3_reduce_bn_out) 192 | inception_3c_3x3_bn_out = self.inception_3c_3x3_bn(inception_3c_3x3_out) 193 | inception_3c_relu_3x3_out = self.inception_3c_relu_3x3(inception_3c_3x3_bn_out) 194 | inception_3c_double_3x3_reduce_out = self.inception_3c_double_3x3_reduce(inception_3b_output_out) 195 | inception_3c_double_3x3_reduce_bn_out = self.inception_3c_double_3x3_reduce_bn( 196 | inception_3c_double_3x3_reduce_out) 197 | inception_3c_relu_double_3x3_reduce_out = self.inception_3c_relu_double_3x3_reduce( 198 | inception_3c_double_3x3_reduce_bn_out) 199 | inception_3c_double_3x3_1_out = self.inception_3c_double_3x3_1(inception_3c_double_3x3_reduce_bn_out) 200 | inception_3c_double_3x3_1_bn_out = self.inception_3c_double_3x3_1_bn(inception_3c_double_3x3_1_out) 201 | inception_3c_relu_double_3x3_1_out = 
self.inception_3c_relu_double_3x3_1(inception_3c_double_3x3_1_bn_out) 202 | inception_3c_double_3x3_2_out = self.inception_3c_double_3x3_2(inception_3c_double_3x3_1_bn_out) 203 | inception_3c_double_3x3_2_bn_out = self.inception_3c_double_3x3_2_bn(inception_3c_double_3x3_2_out) 204 | inception_3c_relu_double_3x3_2_out = self.inception_3c_relu_double_3x3_2(inception_3c_double_3x3_2_bn_out) 205 | inception_3c_pool_out = self.inception_3c_pool(inception_3b_output_out) 206 | inception_3c_output_out = torch.cat( 207 | [inception_3c_3x3_bn_out, inception_3c_double_3x3_2_bn_out, inception_3c_pool_out], 1) 208 | return inception_3c_output_out 209 | 210 | def _block_4a(self, inception_3c_output_out): 211 | inception_4a_1x1_out = self.inception_4a_1x1(inception_3c_output_out) 212 | inception_4a_1x1_bn_out = self.inception_4a_1x1_bn(inception_4a_1x1_out) 213 | inception_4a_relu_1x1_out = self.inception_4a_relu_1x1(inception_4a_1x1_bn_out) 214 | inception_4a_3x3_reduce_out = self.inception_4a_3x3_reduce(inception_3c_output_out) 215 | inception_4a_3x3_reduce_bn_out = self.inception_4a_3x3_reduce_bn(inception_4a_3x3_reduce_out) 216 | inception_4a_relu_3x3_reduce_out = self.inception_4a_relu_3x3_reduce(inception_4a_3x3_reduce_bn_out) 217 | inception_4a_3x3_out = self.inception_4a_3x3(inception_4a_3x3_reduce_bn_out) 218 | inception_4a_3x3_bn_out = self.inception_4a_3x3_bn(inception_4a_3x3_out) 219 | inception_4a_relu_3x3_out = self.inception_4a_relu_3x3(inception_4a_3x3_bn_out) 220 | inception_4a_double_3x3_reduce_out = self.inception_4a_double_3x3_reduce(inception_3c_output_out) 221 | inception_4a_double_3x3_reduce_bn_out = self.inception_4a_double_3x3_reduce_bn( 222 | inception_4a_double_3x3_reduce_out) 223 | inception_4a_relu_double_3x3_reduce_out = self.inception_4a_relu_double_3x3_reduce( 224 | inception_4a_double_3x3_reduce_bn_out) 225 | inception_4a_double_3x3_1_out = self.inception_4a_double_3x3_1(inception_4a_double_3x3_reduce_bn_out) 226 | inception_4a_double_3x3_1_bn_out = self.inception_4a_double_3x3_1_bn(inception_4a_double_3x3_1_out) 227 | inception_4a_relu_double_3x3_1_out = self.inception_4a_relu_double_3x3_1(inception_4a_double_3x3_1_bn_out) 228 | inception_4a_double_3x3_2_out = self.inception_4a_double_3x3_2(inception_4a_double_3x3_1_bn_out) 229 | inception_4a_double_3x3_2_bn_out = self.inception_4a_double_3x3_2_bn(inception_4a_double_3x3_2_out) 230 | inception_4a_relu_double_3x3_2_out = self.inception_4a_relu_double_3x3_2(inception_4a_double_3x3_2_bn_out) 231 | inception_4a_pool_out = self.inception_4a_pool(inception_3c_output_out) 232 | inception_4a_pool_proj_out = self.inception_4a_pool_proj(inception_4a_pool_out) 233 | inception_4a_pool_proj_bn_out = self.inception_4a_pool_proj_bn(inception_4a_pool_proj_out) 234 | inception_4a_relu_pool_proj_out = self.inception_4a_relu_pool_proj(inception_4a_pool_proj_bn_out) 235 | inception_4a_output_out = torch.cat( 236 | [inception_4a_1x1_bn_out, inception_4a_3x3_bn_out, inception_4a_double_3x3_2_bn_out, 237 | inception_4a_pool_proj_bn_out], 1) 238 | return inception_4a_output_out 239 | 240 | def _block_4b(self, inception_4a_output_out): 241 | inception_4b_1x1_out = self.inception_4b_1x1(inception_4a_output_out) 242 | inception_4b_1x1_bn_out = self.inception_4b_1x1_bn(inception_4b_1x1_out) 243 | inception_4b_relu_1x1_out = self.inception_4b_relu_1x1(inception_4b_1x1_bn_out) 244 | inception_4b_3x3_reduce_out = self.inception_4b_3x3_reduce(inception_4a_output_out) 245 | inception_4b_3x3_reduce_bn_out = 
self.inception_4b_3x3_reduce_bn(inception_4b_3x3_reduce_out) 246 | inception_4b_relu_3x3_reduce_out = self.inception_4b_relu_3x3_reduce(inception_4b_3x3_reduce_bn_out) 247 | inception_4b_3x3_out = self.inception_4b_3x3(inception_4b_3x3_reduce_bn_out) 248 | inception_4b_3x3_bn_out = self.inception_4b_3x3_bn(inception_4b_3x3_out) 249 | inception_4b_relu_3x3_out = self.inception_4b_relu_3x3(inception_4b_3x3_bn_out) 250 | inception_4b_double_3x3_reduce_out = self.inception_4b_double_3x3_reduce(inception_4a_output_out) 251 | inception_4b_double_3x3_reduce_bn_out = self.inception_4b_double_3x3_reduce_bn( 252 | inception_4b_double_3x3_reduce_out) 253 | inception_4b_relu_double_3x3_reduce_out = self.inception_4b_relu_double_3x3_reduce( 254 | inception_4b_double_3x3_reduce_bn_out) 255 | inception_4b_double_3x3_1_out = self.inception_4b_double_3x3_1(inception_4b_double_3x3_reduce_bn_out) 256 | inception_4b_double_3x3_1_bn_out = self.inception_4b_double_3x3_1_bn(inception_4b_double_3x3_1_out) 257 | inception_4b_relu_double_3x3_1_out = self.inception_4b_relu_double_3x3_1(inception_4b_double_3x3_1_bn_out) 258 | inception_4b_double_3x3_2_out = self.inception_4b_double_3x3_2(inception_4b_double_3x3_1_bn_out) 259 | inception_4b_double_3x3_2_bn_out = self.inception_4b_double_3x3_2_bn(inception_4b_double_3x3_2_out) 260 | inception_4b_relu_double_3x3_2_out = self.inception_4b_relu_double_3x3_2(inception_4b_double_3x3_2_bn_out) 261 | inception_4b_pool_out = self.inception_4b_pool(inception_4a_output_out) 262 | inception_4b_pool_proj_out = self.inception_4b_pool_proj(inception_4b_pool_out) 263 | inception_4b_pool_proj_bn_out = self.inception_4b_pool_proj_bn(inception_4b_pool_proj_out) 264 | inception_4b_relu_pool_proj_out = self.inception_4b_relu_pool_proj(inception_4b_pool_proj_bn_out) 265 | inception_4b_output_out = torch.cat( 266 | [inception_4b_1x1_bn_out, inception_4b_3x3_bn_out, inception_4b_double_3x3_2_bn_out, 267 | inception_4b_pool_proj_bn_out], 1) 268 | return inception_4b_output_out 269 | 270 | def _block_4c(self, inception_4b_output_out): 271 | inception_4c_1x1_out = self.inception_4c_1x1(inception_4b_output_out) 272 | inception_4c_1x1_bn_out = self.inception_4c_1x1_bn(inception_4c_1x1_out) 273 | inception_4c_relu_1x1_out = self.inception_4c_relu_1x1(inception_4c_1x1_bn_out) 274 | inception_4c_3x3_reduce_out = self.inception_4c_3x3_reduce(inception_4b_output_out) 275 | inception_4c_3x3_reduce_bn_out = self.inception_4c_3x3_reduce_bn(inception_4c_3x3_reduce_out) 276 | inception_4c_relu_3x3_reduce_out = self.inception_4c_relu_3x3_reduce(inception_4c_3x3_reduce_bn_out) 277 | inception_4c_3x3_out = self.inception_4c_3x3(inception_4c_3x3_reduce_bn_out) 278 | inception_4c_3x3_bn_out = self.inception_4c_3x3_bn(inception_4c_3x3_out) 279 | inception_4c_relu_3x3_out = self.inception_4c_relu_3x3(inception_4c_3x3_bn_out) 280 | inception_4c_double_3x3_reduce_out = self.inception_4c_double_3x3_reduce(inception_4b_output_out) 281 | inception_4c_double_3x3_reduce_bn_out = self.inception_4c_double_3x3_reduce_bn( 282 | inception_4c_double_3x3_reduce_out) 283 | inception_4c_relu_double_3x3_reduce_out = self.inception_4c_relu_double_3x3_reduce( 284 | inception_4c_double_3x3_reduce_bn_out) 285 | inception_4c_double_3x3_1_out = self.inception_4c_double_3x3_1(inception_4c_double_3x3_reduce_bn_out) 286 | inception_4c_double_3x3_1_bn_out = self.inception_4c_double_3x3_1_bn(inception_4c_double_3x3_1_out) 287 | inception_4c_relu_double_3x3_1_out = self.inception_4c_relu_double_3x3_1(inception_4c_double_3x3_1_bn_out) 288 | 
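# Note: every ReLU in this model is constructed with nn.ReLU(inplace), so the *_bn_out tensors
# referenced below have already been rectified in place; passing the BN outputs (rather than the
# *_relu_* outputs) to the next layer is therefore equivalent.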
inception_4c_double_3x3_2_out = self.inception_4c_double_3x3_2(inception_4c_double_3x3_1_bn_out) 289 | inception_4c_double_3x3_2_bn_out = self.inception_4c_double_3x3_2_bn(inception_4c_double_3x3_2_out) 290 | inception_4c_relu_double_3x3_2_out = self.inception_4c_relu_double_3x3_2(inception_4c_double_3x3_2_bn_out) 291 | inception_4c_pool_out = self.inception_4c_pool(inception_4b_output_out) 292 | inception_4c_pool_proj_out = self.inception_4c_pool_proj(inception_4c_pool_out) 293 | inception_4c_pool_proj_bn_out = self.inception_4c_pool_proj_bn(inception_4c_pool_proj_out) 294 | inception_4c_relu_pool_proj_out = self.inception_4c_relu_pool_proj(inception_4c_pool_proj_bn_out) 295 | inception_4c_output_out = torch.cat( 296 | [inception_4c_1x1_bn_out, inception_4c_3x3_bn_out, inception_4c_double_3x3_2_bn_out, 297 | inception_4c_pool_proj_bn_out], 1) 298 | return inception_4c_output_out 299 | 300 | def _block_4d(self, inception_4c_output_out): 301 | inception_4d_1x1_out = self.inception_4d_1x1(inception_4c_output_out) 302 | inception_4d_1x1_bn_out = self.inception_4d_1x1_bn(inception_4d_1x1_out) 303 | inception_4d_relu_1x1_out = self.inception_4d_relu_1x1(inception_4d_1x1_bn_out) 304 | inception_4d_3x3_reduce_out = self.inception_4d_3x3_reduce(inception_4c_output_out) 305 | inception_4d_3x3_reduce_bn_out = self.inception_4d_3x3_reduce_bn(inception_4d_3x3_reduce_out) 306 | inception_4d_relu_3x3_reduce_out = self.inception_4d_relu_3x3_reduce(inception_4d_3x3_reduce_bn_out) 307 | inception_4d_3x3_out = self.inception_4d_3x3(inception_4d_3x3_reduce_bn_out) 308 | inception_4d_3x3_bn_out = self.inception_4d_3x3_bn(inception_4d_3x3_out) 309 | inception_4d_relu_3x3_out = self.inception_4d_relu_3x3(inception_4d_3x3_bn_out) 310 | inception_4d_double_3x3_reduce_out = self.inception_4d_double_3x3_reduce(inception_4c_output_out) 311 | inception_4d_double_3x3_reduce_bn_out = self.inception_4d_double_3x3_reduce_bn( 312 | inception_4d_double_3x3_reduce_out) 313 | inception_4d_relu_double_3x3_reduce_out = self.inception_4d_relu_double_3x3_reduce( 314 | inception_4d_double_3x3_reduce_bn_out) 315 | inception_4d_double_3x3_1_out = self.inception_4d_double_3x3_1(inception_4d_double_3x3_reduce_bn_out) 316 | inception_4d_double_3x3_1_bn_out = self.inception_4d_double_3x3_1_bn(inception_4d_double_3x3_1_out) 317 | inception_4d_relu_double_3x3_1_out = self.inception_4d_relu_double_3x3_1(inception_4d_double_3x3_1_bn_out) 318 | inception_4d_double_3x3_2_out = self.inception_4d_double_3x3_2(inception_4d_double_3x3_1_bn_out) 319 | inception_4d_double_3x3_2_bn_out = self.inception_4d_double_3x3_2_bn(inception_4d_double_3x3_2_out) 320 | inception_4d_relu_double_3x3_2_out = self.inception_4d_relu_double_3x3_2(inception_4d_double_3x3_2_bn_out) 321 | inception_4d_pool_out = self.inception_4d_pool(inception_4c_output_out) 322 | inception_4d_pool_proj_out = self.inception_4d_pool_proj(inception_4d_pool_out) 323 | inception_4d_pool_proj_bn_out = self.inception_4d_pool_proj_bn(inception_4d_pool_proj_out) 324 | inception_4d_relu_pool_proj_out = self.inception_4d_relu_pool_proj(inception_4d_pool_proj_bn_out) 325 | inception_4d_output_out = torch.cat( 326 | [inception_4d_1x1_bn_out, inception_4d_3x3_bn_out, inception_4d_double_3x3_2_bn_out, 327 | inception_4d_pool_proj_bn_out], 1) 328 | return inception_4d_output_out 329 | 330 | def _block_4e(self, inception_4d_output_out): 331 | inception_4e_3x3_reduce_out = self.inception_4e_3x3_reduce(inception_4d_output_out) 332 | inception_4e_3x3_reduce_bn_out = 
self.inception_4e_3x3_reduce_bn(inception_4e_3x3_reduce_out) 333 | inception_4e_relu_3x3_reduce_out = self.inception_4e_relu_3x3_reduce(inception_4e_3x3_reduce_bn_out) 334 | inception_4e_3x3_out = self.inception_4e_3x3(inception_4e_3x3_reduce_bn_out) 335 | inception_4e_3x3_bn_out = self.inception_4e_3x3_bn(inception_4e_3x3_out) 336 | inception_4e_relu_3x3_out = self.inception_4e_relu_3x3(inception_4e_3x3_bn_out) 337 | inception_4e_double_3x3_reduce_out = self.inception_4e_double_3x3_reduce(inception_4d_output_out) 338 | inception_4e_double_3x3_reduce_bn_out = self.inception_4e_double_3x3_reduce_bn( 339 | inception_4e_double_3x3_reduce_out) 340 | inception_4e_relu_double_3x3_reduce_out = self.inception_4e_relu_double_3x3_reduce( 341 | inception_4e_double_3x3_reduce_bn_out) 342 | inception_4e_double_3x3_1_out = self.inception_4e_double_3x3_1(inception_4e_double_3x3_reduce_bn_out) 343 | inception_4e_double_3x3_1_bn_out = self.inception_4e_double_3x3_1_bn(inception_4e_double_3x3_1_out) 344 | inception_4e_relu_double_3x3_1_out = self.inception_4e_relu_double_3x3_1(inception_4e_double_3x3_1_bn_out) 345 | inception_4e_double_3x3_2_out = self.inception_4e_double_3x3_2(inception_4e_double_3x3_1_bn_out) 346 | inception_4e_double_3x3_2_bn_out = self.inception_4e_double_3x3_2_bn(inception_4e_double_3x3_2_out) 347 | inception_4e_relu_double_3x3_2_out = self.inception_4e_relu_double_3x3_2(inception_4e_double_3x3_2_bn_out) 348 | inception_4e_pool_out = self.inception_4e_pool(inception_4d_output_out) 349 | inception_4e_output_out = torch.cat( 350 | [inception_4e_3x3_bn_out, inception_4e_double_3x3_2_bn_out, inception_4e_pool_out], 1) 351 | return inception_4e_output_out 352 | 353 | def _block_5a(self, inception_4e_output_out): 354 | inception_5a_1x1_out = self.inception_5a_1x1(inception_4e_output_out) 355 | inception_5a_1x1_bn_out = self.inception_5a_1x1_bn(inception_5a_1x1_out) 356 | inception_5a_relu_1x1_out = self.inception_5a_relu_1x1(inception_5a_1x1_bn_out) 357 | inception_5a_3x3_reduce_out = self.inception_5a_3x3_reduce(inception_4e_output_out) 358 | inception_5a_3x3_reduce_bn_out = self.inception_5a_3x3_reduce_bn(inception_5a_3x3_reduce_out) 359 | inception_5a_relu_3x3_reduce_out = self.inception_5a_relu_3x3_reduce(inception_5a_3x3_reduce_bn_out) 360 | inception_5a_3x3_out = self.inception_5a_3x3(inception_5a_3x3_reduce_bn_out) 361 | inception_5a_3x3_bn_out = self.inception_5a_3x3_bn(inception_5a_3x3_out) 362 | inception_5a_relu_3x3_out = self.inception_5a_relu_3x3(inception_5a_3x3_bn_out) 363 | inception_5a_double_3x3_reduce_out = self.inception_5a_double_3x3_reduce(inception_4e_output_out) 364 | inception_5a_double_3x3_reduce_bn_out = self.inception_5a_double_3x3_reduce_bn( 365 | inception_5a_double_3x3_reduce_out) 366 | inception_5a_relu_double_3x3_reduce_out = self.inception_5a_relu_double_3x3_reduce( 367 | inception_5a_double_3x3_reduce_bn_out) 368 | inception_5a_double_3x3_1_out = self.inception_5a_double_3x3_1(inception_5a_double_3x3_reduce_bn_out) 369 | inception_5a_double_3x3_1_bn_out = self.inception_5a_double_3x3_1_bn(inception_5a_double_3x3_1_out) 370 | inception_5a_relu_double_3x3_1_out = self.inception_5a_relu_double_3x3_1(inception_5a_double_3x3_1_bn_out) 371 | inception_5a_double_3x3_2_out = self.inception_5a_double_3x3_2(inception_5a_double_3x3_1_bn_out) 372 | inception_5a_double_3x3_2_bn_out = self.inception_5a_double_3x3_2_bn(inception_5a_double_3x3_2_out) 373 | inception_5a_relu_double_3x3_2_out = self.inception_5a_relu_double_3x3_2(inception_5a_double_3x3_2_bn_out) 374 | 
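# Pooling branch of inception_5a: pool the block input, project it with a 1x1 conv, apply BN and
# ReLU, then concatenate the four branch outputs along the channel dimension below.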
inception_5a_pool_out = self.inception_5a_pool(inception_4e_output_out) 375 | inception_5a_pool_proj_out = self.inception_5a_pool_proj(inception_5a_pool_out) 376 | inception_5a_pool_proj_bn_out = self.inception_5a_pool_proj_bn(inception_5a_pool_proj_out) 377 | inception_5a_relu_pool_proj_out = self.inception_5a_relu_pool_proj(inception_5a_pool_proj_bn_out) 378 | inception_5a_output_out = torch.cat( 379 | [inception_5a_1x1_bn_out, inception_5a_3x3_bn_out, inception_5a_double_3x3_2_bn_out, 380 | inception_5a_pool_proj_bn_out], 1) 381 | return inception_5a_output_out 382 | 383 | def _block_5b(self, inception_5a_output_out): 384 | inception_5b_1x1_out = self.inception_5b_1x1(inception_5a_output_out) 385 | inception_5b_1x1_bn_out = self.inception_5b_1x1_bn(inception_5b_1x1_out) 386 | inception_5b_relu_1x1_out = self.inception_5b_relu_1x1(inception_5b_1x1_bn_out) 387 | inception_5b_3x3_reduce_out = self.inception_5b_3x3_reduce(inception_5a_output_out) 388 | inception_5b_3x3_reduce_bn_out = self.inception_5b_3x3_reduce_bn(inception_5b_3x3_reduce_out) 389 | inception_5b_relu_3x3_reduce_out = self.inception_5b_relu_3x3_reduce(inception_5b_3x3_reduce_bn_out) 390 | inception_5b_3x3_out = self.inception_5b_3x3(inception_5b_3x3_reduce_bn_out) 391 | inception_5b_3x3_bn_out = self.inception_5b_3x3_bn(inception_5b_3x3_out) 392 | inception_5b_relu_3x3_out = self.inception_5b_relu_3x3(inception_5b_3x3_bn_out) 393 | inception_5b_double_3x3_reduce_out = self.inception_5b_double_3x3_reduce(inception_5a_output_out) 394 | inception_5b_double_3x3_reduce_bn_out = self.inception_5b_double_3x3_reduce_bn( 395 | inception_5b_double_3x3_reduce_out) 396 | inception_5b_relu_double_3x3_reduce_out = self.inception_5b_relu_double_3x3_reduce( 397 | inception_5b_double_3x3_reduce_bn_out) 398 | inception_5b_double_3x3_1_out = self.inception_5b_double_3x3_1(inception_5b_double_3x3_reduce_bn_out) 399 | inception_5b_double_3x3_1_bn_out = self.inception_5b_double_3x3_1_bn(inception_5b_double_3x3_1_out) 400 | inception_5b_relu_double_3x3_1_out = self.inception_5b_relu_double_3x3_1(inception_5b_double_3x3_1_bn_out) 401 | inception_5b_double_3x3_2_out = self.inception_5b_double_3x3_2(inception_5b_double_3x3_1_bn_out) 402 | inception_5b_double_3x3_2_bn_out = self.inception_5b_double_3x3_2_bn(inception_5b_double_3x3_2_out) 403 | inception_5b_relu_double_3x3_2_out = self.inception_5b_relu_double_3x3_2(inception_5b_double_3x3_2_bn_out) 404 | inception_5b_pool_out = self.inception_5b_pool(inception_5a_output_out) 405 | inception_5b_pool_proj_out = self.inception_5b_pool_proj(inception_5b_pool_out) 406 | inception_5b_pool_proj_bn_out = self.inception_5b_pool_proj_bn(inception_5b_pool_proj_out) 407 | inception_5b_relu_pool_proj_out = self.inception_5b_relu_pool_proj(inception_5b_pool_proj_bn_out) 408 | inception_5b_output_out = torch.cat( 409 | [inception_5b_1x1_bn_out, inception_5b_3x3_bn_out, inception_5b_double_3x3_2_bn_out, 410 | inception_5b_pool_proj_bn_out], 1) 411 | return inception_5b_output_out 412 | 413 | def _build_features(self, inplace, num_classes): 414 | self.conv1_7x7_s2 = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3)) 415 | self.conv1_7x7_s2_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 416 | self.conv1_relu_7x7 = nn.ReLU(inplace) 417 | self.pool1_3x3_s2 = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True) 418 | self.conv2_3x3_reduce = nn.Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1)) 419 | self.conv2_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, 
affine=True) 420 | self.conv2_relu_3x3_reduce = nn.ReLU(inplace) 421 | self.conv2_3x3 = nn.Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 422 | self.conv2_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 423 | self.conv2_relu_3x3 = nn.ReLU(inplace) 424 | self.pool2_3x3_s2 = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True) 425 | self.inception_3a_1x1 = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1)) 426 | self.inception_3a_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 427 | self.inception_3a_relu_1x1 = nn.ReLU(inplace) 428 | self.inception_3a_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1)) 429 | self.inception_3a_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 430 | self.inception_3a_relu_3x3_reduce = nn.ReLU(inplace) 431 | self.inception_3a_3x3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 432 | self.inception_3a_3x3_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 433 | self.inception_3a_relu_3x3 = nn.ReLU(inplace) 434 | self.inception_3a_double_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1)) 435 | self.inception_3a_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 436 | self.inception_3a_relu_double_3x3_reduce = nn.ReLU(inplace) 437 | self.inception_3a_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 438 | self.inception_3a_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 439 | self.inception_3a_relu_double_3x3_1 = nn.ReLU(inplace) 440 | self.inception_3a_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 441 | self.inception_3a_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 442 | self.inception_3a_relu_double_3x3_2 = nn.ReLU(inplace) 443 | self.inception_3a_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 444 | self.inception_3a_pool_proj = nn.Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1)) 445 | self.inception_3a_pool_proj_bn = nn.BatchNorm2d(32, eps=1e-05, momentum=0.9, affine=True) 446 | self.inception_3a_relu_pool_proj = nn.ReLU(inplace) 447 | self.inception_3b_1x1 = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1)) 448 | self.inception_3b_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 449 | self.inception_3b_relu_1x1 = nn.ReLU(inplace) 450 | self.inception_3b_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1)) 451 | self.inception_3b_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 452 | self.inception_3b_relu_3x3_reduce = nn.ReLU(inplace) 453 | self.inception_3b_3x3 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 454 | self.inception_3b_3x3_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 455 | self.inception_3b_relu_3x3 = nn.ReLU(inplace) 456 | self.inception_3b_double_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1)) 457 | self.inception_3b_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 458 | self.inception_3b_relu_double_3x3_reduce = nn.ReLU(inplace) 459 | self.inception_3b_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 460 | self.inception_3b_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 461 | self.inception_3b_relu_double_3x3_1 = nn.ReLU(inplace) 462 | self.inception_3b_double_3x3_2 = nn.Conv2d(96, 
96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 463 | self.inception_3b_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 464 | self.inception_3b_relu_double_3x3_2 = nn.ReLU(inplace) 465 | self.inception_3b_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 466 | self.inception_3b_pool_proj = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1)) 467 | self.inception_3b_pool_proj_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 468 | self.inception_3b_relu_pool_proj = nn.ReLU(inplace) 469 | self.inception_3c_3x3_reduce = nn.Conv2d(320, 128, kernel_size=(1, 1), stride=(1, 1)) 470 | self.inception_3c_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 471 | self.inception_3c_relu_3x3_reduce = nn.ReLU(inplace) 472 | self.inception_3c_3x3 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) 473 | self.inception_3c_3x3_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 474 | self.inception_3c_relu_3x3 = nn.ReLU(inplace) 475 | self.inception_3c_double_3x3_reduce = nn.Conv2d(320, 64, kernel_size=(1, 1), stride=(1, 1)) 476 | self.inception_3c_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 477 | self.inception_3c_relu_double_3x3_reduce = nn.ReLU(inplace) 478 | self.inception_3c_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 479 | self.inception_3c_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 480 | self.inception_3c_relu_double_3x3_1 = nn.ReLU(inplace) 481 | self.inception_3c_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) 482 | self.inception_3c_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 483 | self.inception_3c_relu_double_3x3_2 = nn.ReLU(inplace) 484 | self.inception_3c_pool = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True) 485 | self.inception_4a_1x1 = nn.Conv2d(576, 224, kernel_size=(1, 1), stride=(1, 1)) 486 | self.inception_4a_1x1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True) 487 | self.inception_4a_relu_1x1 = nn.ReLU(inplace) 488 | self.inception_4a_3x3_reduce = nn.Conv2d(576, 64, kernel_size=(1, 1), stride=(1, 1)) 489 | self.inception_4a_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True) 490 | self.inception_4a_relu_3x3_reduce = nn.ReLU(inplace) 491 | self.inception_4a_3x3 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 492 | self.inception_4a_3x3_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 493 | self.inception_4a_relu_3x3 = nn.ReLU(inplace) 494 | self.inception_4a_double_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1)) 495 | self.inception_4a_double_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 496 | self.inception_4a_relu_double_3x3_reduce = nn.ReLU(inplace) 497 | self.inception_4a_double_3x3_1 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 498 | self.inception_4a_double_3x3_1_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 499 | self.inception_4a_relu_double_3x3_1 = nn.ReLU(inplace) 500 | self.inception_4a_double_3x3_2 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 501 | self.inception_4a_double_3x3_2_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 502 | self.inception_4a_relu_double_3x3_2 = nn.ReLU(inplace) 503 | self.inception_4a_pool = nn.AvgPool2d(3, stride=1, 
padding=1, ceil_mode=True, count_include_pad=True) 504 | self.inception_4a_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1)) 505 | self.inception_4a_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 506 | self.inception_4a_relu_pool_proj = nn.ReLU(inplace) 507 | self.inception_4b_1x1 = nn.Conv2d(576, 192, kernel_size=(1, 1), stride=(1, 1)) 508 | self.inception_4b_1x1_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 509 | self.inception_4b_relu_1x1 = nn.ReLU(inplace) 510 | self.inception_4b_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1)) 511 | self.inception_4b_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 512 | self.inception_4b_relu_3x3_reduce = nn.ReLU(inplace) 513 | self.inception_4b_3x3 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 514 | self.inception_4b_3x3_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 515 | self.inception_4b_relu_3x3 = nn.ReLU(inplace) 516 | self.inception_4b_double_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1)) 517 | self.inception_4b_double_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 518 | self.inception_4b_relu_double_3x3_reduce = nn.ReLU(inplace) 519 | self.inception_4b_double_3x3_1 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 520 | self.inception_4b_double_3x3_1_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 521 | self.inception_4b_relu_double_3x3_1 = nn.ReLU(inplace) 522 | self.inception_4b_double_3x3_2 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 523 | self.inception_4b_double_3x3_2_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 524 | self.inception_4b_relu_double_3x3_2 = nn.ReLU(inplace) 525 | self.inception_4b_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 526 | self.inception_4b_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1)) 527 | self.inception_4b_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 528 | self.inception_4b_relu_pool_proj = nn.ReLU(inplace) 529 | self.inception_4c_1x1 = nn.Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1)) 530 | self.inception_4c_1x1_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 531 | self.inception_4c_relu_1x1 = nn.ReLU(inplace) 532 | self.inception_4c_3x3_reduce = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1)) 533 | self.inception_4c_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 534 | self.inception_4c_relu_3x3_reduce = nn.ReLU(inplace) 535 | self.inception_4c_3x3 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 536 | self.inception_4c_3x3_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 537 | self.inception_4c_relu_3x3 = nn.ReLU(inplace) 538 | self.inception_4c_double_3x3_reduce = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1)) 539 | self.inception_4c_double_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 540 | self.inception_4c_relu_double_3x3_reduce = nn.ReLU(inplace) 541 | self.inception_4c_double_3x3_1 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 542 | self.inception_4c_double_3x3_1_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 543 | self.inception_4c_relu_double_3x3_1 = nn.ReLU(inplace) 544 | self.inception_4c_double_3x3_2 = nn.Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 545 
| self.inception_4c_double_3x3_2_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 546 | self.inception_4c_relu_double_3x3_2 = nn.ReLU(inplace) 547 | self.inception_4c_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 548 | self.inception_4c_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1)) 549 | self.inception_4c_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 550 | self.inception_4c_relu_pool_proj = nn.ReLU(inplace) 551 | self.inception_4d_1x1 = nn.Conv2d(608, 96, kernel_size=(1, 1), stride=(1, 1)) 552 | self.inception_4d_1x1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True) 553 | self.inception_4d_relu_1x1 = nn.ReLU(inplace) 554 | self.inception_4d_3x3_reduce = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1)) 555 | self.inception_4d_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 556 | self.inception_4d_relu_3x3_reduce = nn.ReLU(inplace) 557 | self.inception_4d_3x3 = nn.Conv2d(128, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 558 | self.inception_4d_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 559 | self.inception_4d_relu_3x3 = nn.ReLU(inplace) 560 | self.inception_4d_double_3x3_reduce = nn.Conv2d(608, 160, kernel_size=(1, 1), stride=(1, 1)) 561 | self.inception_4d_double_3x3_reduce_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 562 | self.inception_4d_relu_double_3x3_reduce = nn.ReLU(inplace) 563 | self.inception_4d_double_3x3_1 = nn.Conv2d(160, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 564 | self.inception_4d_double_3x3_1_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 565 | self.inception_4d_relu_double_3x3_1 = nn.ReLU(inplace) 566 | self.inception_4d_double_3x3_2 = nn.Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 567 | self.inception_4d_double_3x3_2_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 568 | self.inception_4d_relu_double_3x3_2 = nn.ReLU(inplace) 569 | self.inception_4d_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 570 | self.inception_4d_pool_proj = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1)) 571 | self.inception_4d_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 572 | self.inception_4d_relu_pool_proj = nn.ReLU(inplace) 573 | self.inception_4e_3x3_reduce = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1)) 574 | self.inception_4e_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 575 | self.inception_4e_relu_3x3_reduce = nn.ReLU(inplace) 576 | self.inception_4e_3x3 = nn.Conv2d(128, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) 577 | self.inception_4e_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 578 | self.inception_4e_relu_3x3 = nn.ReLU(inplace) 579 | self.inception_4e_double_3x3_reduce = nn.Conv2d(608, 192, kernel_size=(1, 1), stride=(1, 1)) 580 | self.inception_4e_double_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 581 | self.inception_4e_relu_double_3x3_reduce = nn.ReLU(inplace) 582 | self.inception_4e_double_3x3_1 = nn.Conv2d(192, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 583 | self.inception_4e_double_3x3_1_bn = nn.BatchNorm2d(256, eps=1e-05, momentum=0.9, affine=True) 584 | self.inception_4e_relu_double_3x3_1 = nn.ReLU(inplace) 585 | self.inception_4e_double_3x3_2 = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) 586 | 
self.inception_4e_double_3x3_2_bn = nn.BatchNorm2d(256, eps=1e-05, momentum=0.9, affine=True) 587 | self.inception_4e_relu_double_3x3_2 = nn.ReLU(inplace) 588 | self.inception_4e_pool = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True) 589 | self.inception_5a_1x1 = nn.Conv2d(1056, 352, kernel_size=(1, 1), stride=(1, 1)) 590 | self.inception_5a_1x1_bn = nn.BatchNorm2d(352, eps=1e-05, momentum=0.9, affine=True) 591 | self.inception_5a_relu_1x1 = nn.ReLU(inplace) 592 | self.inception_5a_3x3_reduce = nn.Conv2d(1056, 192, kernel_size=(1, 1), stride=(1, 1)) 593 | self.inception_5a_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 594 | self.inception_5a_relu_3x3_reduce = nn.ReLU(inplace) 595 | self.inception_5a_3x3 = nn.Conv2d(192, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 596 | self.inception_5a_3x3_bn = nn.BatchNorm2d(320, eps=1e-05, momentum=0.9, affine=True) 597 | self.inception_5a_relu_3x3 = nn.ReLU(inplace) 598 | self.inception_5a_double_3x3_reduce = nn.Conv2d(1056, 160, kernel_size=(1, 1), stride=(1, 1)) 599 | self.inception_5a_double_3x3_reduce_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True) 600 | self.inception_5a_relu_double_3x3_reduce = nn.ReLU(inplace) 601 | self.inception_5a_double_3x3_1 = nn.Conv2d(160, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 602 | self.inception_5a_double_3x3_1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True) 603 | self.inception_5a_relu_double_3x3_1 = nn.ReLU(inplace) 604 | self.inception_5a_double_3x3_2 = nn.Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 605 | self.inception_5a_double_3x3_2_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True) 606 | self.inception_5a_relu_double_3x3_2 = nn.ReLU(inplace) 607 | self.inception_5a_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True) 608 | self.inception_5a_pool_proj = nn.Conv2d(1056, 128, kernel_size=(1, 1), stride=(1, 1)) 609 | self.inception_5a_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 610 | self.inception_5a_relu_pool_proj = nn.ReLU(inplace) 611 | self.inception_5b_1x1 = nn.Conv2d(1024, 352, kernel_size=(1, 1), stride=(1, 1)) 612 | self.inception_5b_1x1_bn = nn.BatchNorm2d(352, eps=1e-05, momentum=0.9, affine=True) 613 | self.inception_5b_relu_1x1 = nn.ReLU(inplace) 614 | self.inception_5b_3x3_reduce = nn.Conv2d(1024, 192, kernel_size=(1, 1), stride=(1, 1)) 615 | self.inception_5b_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 616 | self.inception_5b_relu_3x3_reduce = nn.ReLU(inplace) 617 | self.inception_5b_3x3 = nn.Conv2d(192, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 618 | self.inception_5b_3x3_bn = nn.BatchNorm2d(320, eps=1e-05, momentum=0.9, affine=True) 619 | self.inception_5b_relu_3x3 = nn.ReLU(inplace) 620 | self.inception_5b_double_3x3_reduce = nn.Conv2d(1024, 192, kernel_size=(1, 1), stride=(1, 1)) 621 | self.inception_5b_double_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True) 622 | self.inception_5b_relu_double_3x3_reduce = nn.ReLU(inplace) 623 | self.inception_5b_double_3x3_1 = nn.Conv2d(192, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 624 | self.inception_5b_double_3x3_1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True) 625 | self.inception_5b_relu_double_3x3_1 = nn.ReLU(inplace) 626 | self.inception_5b_double_3x3_2 = nn.Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 627 | 
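# Annotation (not in the upstream file): every non-downsampling inception block
# here concatenates four branches -- 1x1, 1x1->3x3, 1x1->double-3x3, and an
# avg-pool->1x1 projection -- so the next block's in_channels is the sum of the
# branch widths (e.g. inception_3a: 64 + 64 + 96 + 32 = 256, which is exactly
# what the inception_3b convolutions expect). The downsampling blocks (3c, 4e)
# drop the 1x1 branch and use stride-2 convolutions plus a stride-2 max pool.
# Note also that every BatchNorm2d uses momentum=0.9 instead of PyTorch's
# default 0.1, i.e. running stats update as 0.1 * running + 0.9 * batch_stat,
# presumably carried over from the converted Caffe BN-Inception definition.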
self.inception_5b_double_3x3_2_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True) 628 | self.inception_5b_relu_double_3x3_2 = nn.ReLU(inplace) 629 | self.inception_5b_pool = nn.MaxPool2d((3, 3), stride=(1, 1), padding=(1, 1), dilation=(1, 1), ceil_mode=True) 630 | self.inception_5b_pool_proj = nn.Conv2d(1024, 128, kernel_size=(1, 1), stride=(1, 1)) 631 | self.inception_5b_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True) 632 | self.inception_5b_relu_pool_proj = nn.ReLU(inplace) 633 | self.global_pool = nn.AvgPool2d(7, stride=1, padding=0, ceil_mode=True, count_include_pad=True) 634 | self.fc = nn.Linear(1024, num_classes) 635 | 636 | 637 | def bninception(pretrained='imagenet'): 638 | r"""BNInception model architecture from `_ paper. 639 | """ 640 | if pretrained is not None: 641 | print('=> Loading from pretrained model: {}'.format(pretrained)) 642 | settings = pretrained_settings['bninception'][pretrained] 643 | num_classes = settings['num_classes'] 644 | model = BNInception(num_classes=num_classes) 645 | model.load_state_dict(model_zoo.load_url(settings['url'])) 646 | model.input_space = settings['input_space'] 647 | model.input_size = settings['input_size'] 648 | model.input_range = settings['input_range'] 649 | model.mean = settings['mean'] 650 | model.std = settings['std'] 651 | else: 652 | raise NotImplementedError 653 | return model 654 | 655 | 656 | if __name__ == '__main__': 657 | model = bninception() 658 | -------------------------------------------------------------------------------- /archs/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # Code adapted from https://github.com/tonylins/pytorch-mobilenet-v2 2 | 3 | import torch.nn as nn 4 | import math 5 | 6 | 7 | def conv_bn(inp, oup, stride): 8 | return nn.Sequential( 9 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 10 | nn.BatchNorm2d(oup), 11 | nn.ReLU6(inplace=True) 12 | ) 13 | 14 | 15 | def conv_1x1_bn(inp, oup): 16 | return nn.Sequential( 17 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 18 | nn.BatchNorm2d(oup), 19 | nn.ReLU6(inplace=True) 20 | ) 21 | 22 | 23 | def make_divisible(x, divisible_by=8): 24 | import numpy as np 25 | return int(np.ceil(x * 1. 
/ divisible_by) * divisible_by) 26 | 27 | 28 | class InvertedResidual(nn.Module): 29 | def __init__(self, inp, oup, stride, expand_ratio): 30 | super(InvertedResidual, self).__init__() 31 | self.stride = stride 32 | assert stride in [1, 2] 33 | 34 | hidden_dim = int(inp * expand_ratio) 35 | self.use_res_connect = self.stride == 1 and inp == oup 36 | 37 | if expand_ratio == 1: 38 | self.conv = nn.Sequential( 39 | # dw 40 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 41 | nn.BatchNorm2d(hidden_dim), 42 | nn.ReLU6(inplace=True), 43 | # pw-linear 44 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 45 | nn.BatchNorm2d(oup), 46 | ) 47 | else: 48 | self.conv = nn.Sequential( 49 | # pw 50 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 51 | nn.BatchNorm2d(hidden_dim), 52 | nn.ReLU6(inplace=True), 53 | # dw 54 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), 55 | nn.BatchNorm2d(hidden_dim), 56 | nn.ReLU6(inplace=True), 57 | # pw-linear 58 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 59 | nn.BatchNorm2d(oup), 60 | ) 61 | 62 | def forward(self, x): 63 | if self.use_res_connect: 64 | return x + self.conv(x) 65 | else: 66 | return self.conv(x) 67 | 68 | 69 | class MobileNetV2(nn.Module): 70 | def __init__(self, n_class=1000, input_size=224, width_mult=1.): 71 | super(MobileNetV2, self).__init__() 72 | block = InvertedResidual 73 | input_channel = 32 74 | last_channel = 1280 75 | interverted_residual_setting = [ 76 | # t, c, n, s 77 | [1, 16, 1, 1], 78 | [6, 24, 2, 2], 79 | [6, 32, 3, 2], 80 | [6, 64, 4, 2], 81 | [6, 96, 3, 1], 82 | [6, 160, 3, 2], 83 | [6, 320, 1, 1], 84 | ] 85 | 86 | # building first layer 87 | assert input_size % 32 == 0 88 | # input_channel = make_divisible(input_channel * width_mult) # first channel is always 32! 89 | self.last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel 90 | self.features = [conv_bn(3, input_channel, 2)] 91 | # building inverted residual blocks 92 | for t, c, n, s in interverted_residual_setting: 93 | output_channel = make_divisible(c * width_mult) if t > 1 else c 94 | for i in range(n): 95 | if i == 0: 96 | self.features.append(block(input_channel, output_channel, s, expand_ratio=t)) 97 | else: 98 | self.features.append(block(input_channel, output_channel, 1, expand_ratio=t)) 99 | input_channel = output_channel 100 | # building last several layers 101 | self.features.append(conv_1x1_bn(input_channel, self.last_channel)) 102 | # make it nn.Sequential 103 | self.features = nn.Sequential(*self.features) 104 | 105 | # building classifier 106 | self.classifier = nn.Linear(self.last_channel, n_class) 107 | 108 | self._initialize_weights() 109 | 110 | def forward(self, x): 111 | x = self.features(x) 112 | x = x.mean(3).mean(2) 113 | x = self.classifier(x) 114 | return x 115 | 116 | def _initialize_weights(self): 117 | for m in self.modules(): 118 | if isinstance(m, nn.Conv2d): 119 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 120 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 121 | if m.bias is not None: 122 | m.bias.data.zero_() 123 | elif isinstance(m, nn.BatchNorm2d): 124 | m.weight.data.fill_(1) 125 | m.bias.data.zero_() 126 | elif isinstance(m, nn.Linear): 127 | n = m.weight.size(1) 128 | m.weight.data.normal_(0, 0.01) 129 | m.bias.data.zero_() 130 | 131 | 132 | def mobilenet_v2(pretrained=True): 133 | model = MobileNetV2(width_mult=1) 134 | 135 | if pretrained: 136 | try: 137 | from torch.hub import load_state_dict_from_url 138 | except ImportError: 139 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 140 | state_dict = load_state_dict_from_url( 141 | 'https://www.dropbox.com/s/47tyzpofuuyyv1b/mobilenetv2_1.0-f2a8633.pth.tar?dl=1', progress=True) 142 | model.load_state_dict(state_dict) 143 | return model 144 | 145 | 146 | if __name__ == '__main__': 147 | net = mobilenet_v2(True) 148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import sys 7 | import os 8 | import time 9 | import shutil 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.optim 13 | from torch.nn.utils import clip_grad_norm_ 14 | 15 | from ops.dataset import PANDataSet 16 | from ops.models import PAN 17 | from ops.transforms import * 18 | from opts import parser 19 | from ops import dataset_config 20 | from ops.utils import AverageMeter, accuracy 21 | from ops.temporal_shift import make_temporal_pool 22 | 23 | from tensorboardX import SummaryWriter 24 | 25 | best_prec1 = 0 26 | 27 | def main(): 28 | 29 | global args, best_prec1 30 | args = parser.parse_args() 31 | 32 | if args.base == 'TSM': 33 | args.shift = True 34 | args.shift_div = 8 35 | args.shift_place = 'blockres' 36 | 37 | num_class, args.train_list, args.val_list, args.root_path, prefix = dataset_config.return_dataset(args.dataset, 38 | args.modality) 39 | 40 | full_arch_name = args.arch 41 | if args.shift: 42 | full_arch_name += '_shift{}_{}'.format(args.shift_div, args.shift_place) 43 | if args.temporal_pool: 44 | full_arch_name += '_tpool' 45 | args.store_name = '_'.join( 46 | ['PAN', args.modality, args.dataset, full_arch_name, args.consensus_type, 'segment%d' % args.num_segments, 47 | 'e{}'.format(args.epochs)]) 48 | if args.pretrain != 'imagenet': 49 | args.store_name += '_{}'.format(args.pretrain) 50 | if args.lr_type != 'step': 51 | args.store_name += '_{}'.format(args.lr_type) 52 | if args.dense_sample: 53 | args.store_name += '_dense' 54 | if args.non_local > 0: 55 | args.store_name += '_nl' 56 | if args.suffix is not None: 57 | args.store_name += '_{}'.format(args.suffix) 58 | print('- storing name: ' + args.store_name) 59 | 60 | check_rootfolders() 61 | 62 | if args.modality == 'RGB': 63 | data_length = 1 64 | elif args.modality in ['PA', 'Lite']: 65 | data_length = 4 66 | elif args.modality in ['Flow', 'RGBDiff']: 67 | data_length = 5 68 | 69 | print("-"*30) 70 | print("Environment Versions:") 71 | print("- Python: {}".format(sys.version)) 72 | print("- PyTorch: {}".format(torch.__version__)) 73 | print("- TorchVison: {}".format(torchvision.__version__)) 74 | 75 | args_dict = args.__dict__ 76 | print("-"*30) 77 | print("PAN Configurations:") 78 | 
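# Annotation (not in the upstream file): data_length chosen above is the number
# of consecutive frames sampled per segment (1 for RGB, 4 for PA/Lite, 5 for
# Flow/RGBDiff). After Stack()/ToTorchFormatTensor the clip reaches the model
# as one channel-stacked tensor -- roughly num_segments * data_length * 3
# channels for 3-channel frames and num_segments * data_length * 2 for the
# (x, y) flow pairs -- and PAN.forward() reshapes it back into per-frame views.
# For the PA modality, the PA module later collapses each group of data_length
# frames into data_length - 1 motion maps.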
print(args_dict) 79 | print("-"*30) 80 | 81 | model = PAN(num_class, args.num_segments, args.modality, 82 | base_model=args.arch, 83 | consensus_type=args.consensus_type, 84 | dropout=args.dropout, 85 | img_feature_dim=args.img_feature_dim, 86 | partial_bn=not args.no_partialbn, 87 | pretrain=args.pretrain, 88 | is_shift=args.shift, shift_div=args.shift_div, shift_place=args.shift_place, 89 | fc_lr5=not (args.tune_from and args.dataset in args.tune_from), 90 | temporal_pool=args.temporal_pool, 91 | non_local=args.non_local, data_length=data_length, has_VAP=args.VAP) 92 | 93 | #print(model) 94 | 95 | crop_size = model.crop_size 96 | scale_size = model.scale_size 97 | input_mean = model.input_mean 98 | input_std = model.input_std 99 | policies = model.get_optim_policies() 100 | train_augmentation = model.get_augmentation(flip=False if 'something' in args.dataset or 'jester' in args.dataset else True) 101 | 102 | model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda() 103 | 104 | optimizer = torch.optim.SGD(policies, 105 | args.lr, 106 | momentum=args.momentum, 107 | weight_decay=args.weight_decay) 108 | 109 | if args.resume: 110 | if args.temporal_pool: # early temporal pool so that we can load the state_dict 111 | make_temporal_pool(model.module.base_model, args.num_segments) 112 | if os.path.isfile(args.resume): 113 | print(("=> loading checkpoint '{}'".format(args.resume))) 114 | checkpoint = torch.load(args.resume) 115 | args.start_epoch = checkpoint['epoch'] 116 | best_prec1 = checkpoint['best_prec1'] 117 | model.load_state_dict(checkpoint['state_dict']) 118 | optimizer.load_state_dict(checkpoint['optimizer']) 119 | print(("=> loaded checkpoint '{}' (epoch {})" 120 | .format(args.evaluate, checkpoint['epoch']))) 121 | else: 122 | print(("=> no checkpoint found at '{}'".format(args.resume))) 123 | 124 | if args.tune_from: 125 | print(("=> fine-tuning from '{}'".format(args.tune_from))) 126 | sd = torch.load(args.tune_from) 127 | sd = sd['state_dict'] 128 | model_dict = model.state_dict() 129 | replace_dict = [] 130 | for k, v in sd.items(): 131 | if k not in model_dict and k.replace('.net', '') in model_dict: 132 | print('=> Load after remove .net: ', k) 133 | replace_dict.append((k, k.replace('.net', ''))) 134 | for k, v in model_dict.items(): 135 | if k not in sd and k.replace('.net', '') in sd: 136 | print('=> Load after adding .net: ', k) 137 | replace_dict.append((k.replace('.net', ''), k)) 138 | 139 | for k, k_new in replace_dict: 140 | sd[k_new] = sd.pop(k) 141 | keys1 = set(list(sd.keys())) 142 | keys2 = set(list(model_dict.keys())) 143 | set_diff = (keys1 - keys2) | (keys2 - keys1) 144 | print('#### Notice: keys that failed to load: {}'.format(set_diff)) 145 | if args.dataset not in args.tune_from: # new dataset 146 | print('=> New dataset, do not load fc weights') 147 | sd = {k: v for k, v in sd.items() if 'fc' not in k} 148 | if args.modality == 'Flow' and 'Flow' not in args.tune_from: 149 | sd = {k: v for k, v in sd.items() if 'conv1.weight' not in k} 150 | model_dict.update(sd) 151 | model.load_state_dict(model_dict) 152 | 153 | if args.temporal_pool and not args.resume: 154 | make_temporal_pool(model.module.base_model, args.num_segments) 155 | 156 | cudnn.benchmark = True 157 | 158 | # Data loading code 159 | if args.modality != 'RGBDiff': 160 | normalize = GroupNormalize(input_mean, input_std) 161 | else: 162 | normalize = IdentityTransform() 163 | 164 | train_loader = torch.utils.data.DataLoader( 165 | PANDataSet(args.root_path, args.train_list, 
num_segments=args.num_segments, 166 | new_length=data_length, 167 | modality=args.modality, 168 | image_tmpl=prefix, 169 | transform=torchvision.transforms.Compose([ 170 | train_augmentation, 171 | Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])), 172 | ToTorchFormatTensor(div=(args.arch not in ['BNInception', 'InceptionV3'])), 173 | normalize, 174 | ]), dense_sample=args.dense_sample, is_lmdb=args.lmdb), 175 | batch_size=args.batch_size, shuffle=True, 176 | num_workers=args.workers, pin_memory=True, 177 | drop_last=True) # prevent something not % n_GPU 178 | 179 | val_loader = torch.utils.data.DataLoader( 180 | PANDataSet(args.root_path, args.val_list, num_segments=args.num_segments, 181 | new_length=data_length, 182 | modality=args.modality, 183 | image_tmpl=prefix, 184 | random_shift=False, 185 | transform=torchvision.transforms.Compose([ 186 | GroupScale(int(scale_size)), 187 | GroupCenterCrop(crop_size), 188 | Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])), 189 | ToTorchFormatTensor(div=(args.arch not in ['BNInception', 'InceptionV3'])), 190 | normalize, 191 | ]), dense_sample=args.dense_sample, is_lmdb=args.lmdb), 192 | batch_size=args.batch_size, shuffle=False, 193 | num_workers=args.workers, pin_memory=True) 194 | 195 | # define loss function (criterion) and optimizer 196 | if args.loss_type == 'nll': 197 | criterion = torch.nn.CrossEntropyLoss().cuda() 198 | else: 199 | raise ValueError("Unknown loss type") 200 | 201 | print("-"*30) 202 | for group in policies: 203 | print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format( 204 | group['name'], len(group['params']), group['lr_mult'], group['decay_mult']))) 205 | print("-"*30) 206 | 207 | if args.evaluate: 208 | validate(val_loader, model, criterion, 0) 209 | return 210 | 211 | log_training = open(os.path.join(args.root_log, args.store_name, 'log.csv'), 'w') 212 | with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f: 213 | f.write(str(args)) 214 | tf_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name)) 215 | for epoch in range(args.start_epoch, args.epochs): 216 | adjust_learning_rate(optimizer, epoch, args.lr_type, args.lr_steps) 217 | 218 | # train for one epoch 219 | train(train_loader, model, criterion, optimizer, epoch, log_training, tf_writer) 220 | 221 | # evaluate on validation set 222 | if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1: 223 | prec1 = validate(val_loader, model, criterion, epoch, log_training, tf_writer) 224 | 225 | # remember best prec@1 and save checkpoint 226 | is_best = prec1 > best_prec1 227 | best_prec1 = max(prec1, best_prec1) 228 | tf_writer.add_scalar('acc/test_top1_best', best_prec1, epoch) 229 | 230 | output_best = 'Best Prec@1: %.3f\n' % (best_prec1) 231 | print(output_best) 232 | log_training.write(output_best + '\n') 233 | log_training.flush() 234 | 235 | save_checkpoint({ 236 | 'epoch': epoch + 1, 237 | 'arch': args.arch, 238 | 'state_dict': model.state_dict(), 239 | 'optimizer': optimizer.state_dict(), 240 | 'best_prec1': best_prec1, 241 | }, is_best) 242 | 243 | def train(train_loader, model, criterion, optimizer, epoch, log, tf_writer): 244 | batch_time = AverageMeter() 245 | data_time = AverageMeter() 246 | losses = AverageMeter() 247 | top1 = AverageMeter() 248 | top5 = AverageMeter() 249 | 250 | if args.no_partialbn: 251 | model.module.partialBN(False) 252 | else: 253 | model.module.partialBN(True) 254 | 255 | # switch to train mode 256 | model.train() 257 | 258 | end = time.time() 259 | for 
i, (input, target) in enumerate(train_loader): 260 | # measure data loading time 261 | data_time.update(time.time() - end) 262 | if i == 20: 263 | os.system("gpustat") 264 | target = target.cuda() 265 | input_var = torch.autograd.Variable(input) 266 | target_var = torch.autograd.Variable(target) 267 | 268 | # compute output 269 | output = model(input_var) 270 | loss = criterion(output, target_var) 271 | 272 | # measure accuracy and record loss 273 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 274 | losses.update(loss.item(), input.size(0)) 275 | top1.update(prec1.item(), input.size(0)) 276 | top5.update(prec5.item(), input.size(0)) 277 | 278 | # compute gradient and do SGD step 279 | loss.backward() 280 | 281 | no_grad_cnt = 0 282 | 283 | if i % args.iter_size == 0: 284 | # scale down gradients when iter size is functioning 285 | if args.iter_size != 1: 286 | for g in optimizer.param_groups: 287 | for p in g['params']: 288 | if isinstance(p.grad, torch.Tensor): 289 | p.grad /= args.iter_size 290 | else: 291 | no_grad_cnt = no_grad_cnt + 1 292 | 293 | if args.clip_gradient is not None: 294 | total_norm = clip_grad_norm_(model.parameters(), args.clip_gradient) 295 | else: 296 | total_norm = 0 297 | 298 | optimizer.step() 299 | optimizer.zero_grad() 300 | 301 | #if i == 0: 302 | # print("{}\nWARNING: There are {} params without gradient!!!!!\n{}".format("*"*50, no_grad_cnt, "*"*50)) 303 | 304 | # measure elapsed time 305 | batch_time.update(time.time() - end) 306 | end = time.time() 307 | 308 | if i % args.print_freq == 0: 309 | output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 310 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 311 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 312 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 313 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 314 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 315 | epoch, i, len(train_loader), batch_time=batch_time, 316 | data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'] * 0.1)) # TODO 317 | print(output) 318 | log.write(output + '\n') 319 | log.flush() 320 | 321 | tf_writer.add_scalar('loss/train', losses.avg, epoch) 322 | tf_writer.add_scalar('acc/train_top1', top1.avg, epoch) 323 | tf_writer.add_scalar('acc/train_top5', top5.avg, epoch) 324 | tf_writer.add_scalar('lr', optimizer.param_groups[-1]['lr'], epoch) 325 | 326 | 327 | def validate(val_loader, model, criterion, epoch, log=None, tf_writer=None): 328 | batch_time = AverageMeter() 329 | losses = AverageMeter() 330 | top1 = AverageMeter() 331 | top5 = AverageMeter() 332 | 333 | # switch to evaluate mode 334 | model.eval() 335 | 336 | end = time.time() 337 | with torch.no_grad(): 338 | for i, (input, target) in enumerate(val_loader): 339 | target = target.cuda() 340 | 341 | # compute output 342 | output = model(input) 343 | loss = criterion(output, target) 344 | 345 | # measure accuracy and record loss 346 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 347 | 348 | losses.update(loss.item(), input.size(0)) 349 | top1.update(prec1.item(), input.size(0)) 350 | top5.update(prec5.item(), input.size(0)) 351 | 352 | # measure elapsed time 353 | batch_time.update(time.time() - end) 354 | end = time.time() 355 | 356 | if i % args.print_freq == 0: 357 | output = ('Test: [{0}/{1}]\t' 358 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 359 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 360 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 361 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 362 | i, 
len(val_loader), batch_time=batch_time, loss=losses, 363 | top1=top1, top5=top5)) 364 | print(output) 365 | if log is not None: 366 | log.write(output + '\n') 367 | log.flush() 368 | 369 | output = ('Testing Results: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Loss {loss.avg:.5f}' 370 | .format(top1=top1, top5=top5, loss=losses)) 371 | print(output) 372 | if log is not None: 373 | log.write(output + '\n') 374 | log.flush() 375 | 376 | if tf_writer is not None: 377 | tf_writer.add_scalar('loss/test', losses.avg, epoch) 378 | tf_writer.add_scalar('acc/test_top1', top1.avg, epoch) 379 | tf_writer.add_scalar('acc/test_top5', top5.avg, epoch) 380 | 381 | return top1.avg 382 | 383 | 384 | def save_checkpoint(state, is_best): 385 | filename = '%s/%s/ckpt.pth.tar' % (args.root_model, args.store_name) 386 | torch.save(state, filename) 387 | if is_best: 388 | shutil.copyfile(filename, filename.replace('pth.tar', 'best.pth.tar')) 389 | 390 | 391 | def adjust_learning_rate(optimizer, epoch, lr_type, lr_steps): 392 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 393 | if lr_type == 'step': 394 | decay = 0.1 ** (sum(epoch >= np.array(lr_steps))) 395 | lr = args.lr * decay 396 | decay = args.weight_decay 397 | elif lr_type == 'cos': 398 | import math 399 | lr = 0.5 * args.lr * (1 + math.cos(math.pi * epoch / args.epochs)) 400 | decay = args.weight_decay 401 | else: 402 | raise NotImplementedError 403 | for param_group in optimizer.param_groups: 404 | param_group['lr'] = lr * param_group['lr_mult'] 405 | param_group['weight_decay'] = decay * param_group['decay_mult'] 406 | 407 | 408 | def check_rootfolders(): 409 | """Create log and model folder""" 410 | folders_util = [args.root_log, args.root_model, 411 | os.path.join(args.root_log, args.store_name), 412 | os.path.join(args.root_model, args.store_name)] 413 | for folder in folders_util: 414 | if not os.path.exists(folder): 415 | print('creating folder ' + folder) 416 | os.mkdir(folder) 417 | 418 | if __name__ == '__main__': 419 | main() 420 | -------------------------------------------------------------------------------- /ops/PAN_modules.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import torch 7 | from torch import nn 8 | import math 9 | 10 | class PA(nn.Module): 11 | def __init__(self, n_length): 12 | super(PA, self).__init__() 13 | self.shallow_conv = nn.Conv2d(3,8,7,1,3) 14 | self.n_length = n_length 15 | for m in self.modules(): 16 | if isinstance(m, nn.Conv2d): 17 | nn.init.normal_(m.weight.data, 0, 0.001) 18 | nn.init.constant_(m.bias.data, 0) 19 | 20 | def forward(self, x): 21 | h, w = x.size(-2), x.size(-1) 22 | x = x.view((-1, 3) + x.size()[-2:]) 23 | x = self.shallow_conv(x) 24 | x = x.view(-1, self.n_length, x.size(-3), x.size(-2)*x.size(-1)) 25 | for i in range(self.n_length-1): 26 | d_i = nn.PairwiseDistance(p=2)(x[:,i,:,:], x[:,i+1,:,:]).unsqueeze(1) 27 | d = d_i if i == 0 else torch.cat((d, d_i), 1) 28 | PA = d.view(-1, 1*(self.n_length-1), h, w) 29 | return PA 30 | 31 | class VAP(nn.Module): 32 | def __init__(self, n_segment, feature_dim, num_class, dropout_ratio): 33 | super(VAP, self).__init__() 34 | VAP_level = int(math.log(n_segment, 2)) 35 | print("=> Using {}-level VAP".format(VAP_level)) 36 | self.n_segment = n_segment 
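# Annotation (not in the upstream file): the PA module above realizes
# "persistence of appearance" -- every frame passes through one shallow 7x7
# convolution (3 -> 8 channels), and the per-pixel L2 distance between the
# feature maps of adjacent frames is taken, turning n_length RGB frames into
# n_length - 1 single-channel motion maps. VAP (continued below) is the
# aggregation head: it max-pools the per-segment features over log2(n_segment)
# temporal scales, reweights the pooled descriptors with a small two-layer
# excitation MLP (TES) followed by a softmax, and sends the weighted sum
# through dropout and the final linear classifier.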
37 | self.VAP_level = VAP_level 38 | total_timescale = 0 39 | for i in range(VAP_level): 40 | timescale = 2**i 41 | total_timescale += timescale 42 | setattr(self, "VAP_{}".format(timescale), nn.MaxPool3d((n_segment//timescale,1,1),1,0,(timescale,1,1))) 43 | self.GAP = nn.AdaptiveAvgPool1d(1) 44 | self.TES = nn.Sequential( 45 | nn.Linear(total_timescale, total_timescale*4, bias=False), 46 | nn.ReLU(inplace=True), 47 | nn.Linear(total_timescale*4, total_timescale, bias=False) 48 | ) 49 | self.softmax = nn.Softmax(dim=1) 50 | self.dropout = nn.Dropout(p=dropout_ratio) 51 | self.pred = nn.Linear(feature_dim, num_class) 52 | 53 | # fc init 54 | for m in self.modules(): 55 | if isinstance(m, nn.Linear): 56 | nn.init.normal_(m.weight.data, 0, 0.001) 57 | if hasattr(m.bias, 'data'): 58 | nn.init.constant_(m.bias.data, 0) 59 | 60 | def forward(self, x): 61 | _, d = x.size() 62 | x = x.view(-1, self.n_segment, d, 1, 1).permute(0,2,1,3,4) 63 | x = torch.cat(tuple([getattr(self, "VAP_{}".format(2**i))(x) for i in range(self.VAP_level)]), 2).squeeze(3).squeeze(3).permute(0,2,1) 64 | w = self.GAP(x).squeeze(2) 65 | w = self.softmax(self.TES(w)) 66 | x = x * w.unsqueeze(2) 67 | x = x.sum(dim=1) 68 | x = self.dropout(x) 69 | x = self.pred(x.view(-1,d)) 70 | return x 71 | 72 | -------------------------------------------------------------------------------- /ops/__init__.py: -------------------------------------------------------------------------------- 1 | from ops.basic_ops import * -------------------------------------------------------------------------------- /ops/basic_ops.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import torch 7 | 8 | 9 | class Identity(torch.nn.Module): 10 | def forward(self, input): 11 | return input 12 | 13 | 14 | class SegmentConsensus(torch.nn.Module): 15 | 16 | def __init__(self, consensus_type, dim=1): 17 | super(SegmentConsensus, self).__init__() 18 | self.consensus_type = consensus_type 19 | self.dim = dim 20 | self.shape = None 21 | 22 | def forward(self, input_tensor): 23 | self.shape = input_tensor.size() 24 | if self.consensus_type == 'avg': 25 | output = input_tensor.mean(dim=self.dim, keepdim=True) 26 | elif self.consensus_type == 'identity': 27 | output = input_tensor 28 | else: 29 | output = None 30 | 31 | return output 32 | 33 | 34 | class ConsensusModule(torch.nn.Module): 35 | 36 | def __init__(self, consensus_type, dim=1): 37 | super(ConsensusModule, self).__init__() 38 | self.consensus_type = consensus_type if consensus_type != 'rnn' else 'identity' 39 | self.dim = dim 40 | 41 | def forward(self, input): 42 | return SegmentConsensus(self.consensus_type, self.dim)(input) 43 | -------------------------------------------------------------------------------- /ops/dataset.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import torch.utils.data as data 7 | 8 | from PIL import Image 9 | import os 10 | import numpy as np 11 | from numpy.random import randint 12 | import lmdb 13 | from io import BytesIO 14 | 15 | 
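# Annotation (illustrative, not in the upstream file): each line of the train/
# val list file consumed below is whitespace-separated as
#     <frame_folder> <num_frames> <label>
# e.g. "some_video_dir 48 27" (hypothetical values). VideoRecord simply exposes
# those three columns, and PANDataSet._parse_list drops records with fewer than
# 3 frames except in test mode without remove_missing.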
class VideoRecord(object): 16 | def __init__(self, row): 17 | self._data = row 18 | 19 | @property 20 | def path(self): 21 | return self._data[0] 22 | 23 | @property 24 | def num_frames(self): 25 | return int(self._data[1]) 26 | 27 | @property 28 | def label(self): 29 | return int(self._data[2]) 30 | 31 | 32 | class PANDataSet(data.Dataset): 33 | def __init__(self, root_path, list_file, 34 | num_segments=3, new_length=1, modality='RGB', 35 | image_tmpl='img_{:05d}.jpg', transform=None, 36 | random_shift=True, test_mode=False, 37 | remove_missing=False, dense_sample=False, twice_sample=False, is_lmdb=False): 38 | 39 | self.root_path = root_path 40 | self.list_file = list_file 41 | self.num_segments = num_segments 42 | self.new_length = new_length 43 | self.modality = modality 44 | self.image_tmpl = image_tmpl 45 | self.transform = transform 46 | self.random_shift = random_shift 47 | self.test_mode = test_mode 48 | self.remove_missing = remove_missing 49 | self.dense_sample = dense_sample # using dense sample as I3D 50 | self.twice_sample = twice_sample # twice sample for more validation 51 | if self.dense_sample: 52 | print('=> Using dense sample for the dataset...') 53 | if self.twice_sample: 54 | print('=> Using twice sample for the dataset...') 55 | 56 | self.is_lmdb = is_lmdb 57 | if self.is_lmdb: 58 | print('=> Loading lmdb dataset from: {}'.format(self.root_path)) 59 | self.database = lmdb.open(self.root_path, readonly=True).begin().cursor() 60 | 61 | if self.modality == 'RGBDiff': 62 | self.new_length += 1 # Diff needs one more image to calculate diff 63 | 64 | self._parse_list() 65 | 66 | def _load_image(self, directory, idx): 67 | if self.modality in ['RGB','PA', 'Lite', 'RGBDiff']: 68 | if self.is_lmdb: 69 | return [Image.open(BytesIO(self.database.get("{}/{:03d}/{:08d}".format(directory, 0, idx-1).encode())))] 70 | else: 71 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')] 72 | ''' 73 | try: 74 | if self.is_lmdb: 75 | return [Image.open(BytesIO(self.database.get("{}/{:03d}/{:08d}".format(directory, 0, idx-1).encode())))] 76 | else: 77 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')] 78 | except Exception: 79 | print('error loading image:', os.path.join(self.root_path, directory, self.image_tmpl.format(idx))) 80 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')] 81 | ''' 82 | elif self.modality == 'Flow': 83 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': # ucf 84 | x_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('x', idx))).convert( 85 | 'L') 86 | y_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('y', idx))).convert( 87 | 'L') 88 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': # something v1 flow 89 | x_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl. 90 | format(int(directory), 'x', idx))).convert('L') 91 | y_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl. 
92 | format(int(directory), 'y', idx))).convert('L') 93 | else: 94 | try: 95 | # idx_skip = 1 + (idx-1)*5 96 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert( 97 | 'RGB') 98 | except Exception: 99 | print('error loading flow file:', 100 | os.path.join(self.root_path, directory, self.image_tmpl.format(idx))) 101 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB') 102 | # the input flow file is RGB image with (flow_x, flow_y, blank) for each channel 103 | flow_x, flow_y, _ = flow.split() 104 | x_img = flow_x.convert('L') 105 | y_img = flow_y.convert('L') 106 | 107 | return [x_img, y_img] 108 | 109 | def _parse_list(self): 110 | # check the frame number is large >3: 111 | tmp = [x.strip().split(' ') for x in open(self.list_file)] 112 | if not self.test_mode or self.remove_missing: 113 | tmp = [item for item in tmp if int(item[1]) >= 3] 114 | self.video_list = [VideoRecord(item) for item in tmp] 115 | 116 | if self.image_tmpl == '{:06d}-{}_{:05d}.jpg': 117 | for v in self.video_list: 118 | v._data[1] = int(v._data[1]) / 2 119 | print('video number:%d' % (len(self.video_list))) 120 | 121 | def _sample_indices(self, record): 122 | """ 123 | 124 | :param record: VideoRecord 125 | :return: list 126 | """ 127 | if self.dense_sample: # i3d dense sample 128 | sample_pos = max(1, 1 + record.num_frames - 64) 129 | t_stride = 64 // self.num_segments 130 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1) 131 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] 132 | return np.array(offsets) + 1 133 | else: # normal sample 134 | average_duration = (record.num_frames - self.new_length + 1) // self.num_segments 135 | if average_duration > 0: 136 | offsets = np.multiply(list(range(self.num_segments)), average_duration) + randint(average_duration, 137 | size=self.num_segments) 138 | elif record.num_frames > self.num_segments: 139 | offsets = np.sort(randint(record.num_frames - self.new_length + 1, size=self.num_segments)) 140 | else: 141 | offsets = np.zeros((self.num_segments,)) 142 | return offsets + 1 143 | 144 | def _get_val_indices(self, record): 145 | if self.dense_sample: # i3d dense sample 146 | sample_pos = max(1, 1 + record.num_frames - 64) 147 | t_stride = 64 // self.num_segments 148 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1) 149 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] 150 | return np.array(offsets) + 1 151 | else: 152 | if record.num_frames > self.num_segments + self.new_length - 1: 153 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) 154 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)]) 155 | else: 156 | offsets = np.zeros((self.num_segments,)) 157 | return offsets + 1 158 | 159 | def _get_test_indices(self, record): 160 | if self.dense_sample: 161 | sample_pos = max(1, 1 + record.num_frames - 64) 162 | t_stride = 64 // self.num_segments 163 | start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int) 164 | offsets = [] 165 | for start_idx in start_list.tolist(): 166 | offsets += [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] 167 | return np.array(offsets) + 1 168 | elif self.twice_sample: 169 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) 170 | 171 | offsets = np.array([int(tick / 2.0 + tick * x) 
for x in range(self.num_segments)] + 172 | [int(tick * x) for x in range(self.num_segments)]) 173 | 174 | return offsets + 1 175 | else: 176 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) 177 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)]) 178 | return offsets + 1 179 | 180 | def __getitem__(self, index): 181 | record = self.video_list[index] 182 | # check this is a legit video folder 183 | 184 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': 185 | file_name = self.image_tmpl.format('x', 1) 186 | full_path = os.path.join(self.root_path, record.path, file_name) 187 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': 188 | file_name = self.image_tmpl.format(int(record.path), 'x', 1) 189 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name) 190 | else: 191 | file_name = self.image_tmpl.format(1) 192 | full_path = os.path.join(self.root_path, record.path, file_name) 193 | 194 | ''' 195 | while not os.path.exists(full_path): 196 | print('################## Not Found:', os.path.join(self.root_path, record.path, file_name)) 197 | index = np.random.randint(len(self.video_list)) 198 | record = self.video_list[index] 199 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': 200 | file_name = self.image_tmpl.format('x', 1) 201 | full_path = os.path.join(self.root_path, record.path, file_name) 202 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': 203 | file_name = self.image_tmpl.format(int(record.path), 'x', 1) 204 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name) 205 | else: 206 | file_name = self.image_tmpl.format(1) 207 | full_path = os.path.join(self.root_path, record.path, file_name) 208 | ''' 209 | 210 | if not self.test_mode: 211 | segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record) 212 | else: 213 | segment_indices = self._get_test_indices(record) 214 | return self.get(record, segment_indices) 215 | 216 | def get(self, record, indices): 217 | 218 | images = list() 219 | for seg_ind in indices: 220 | p = int(seg_ind) 221 | for i in range(self.new_length): 222 | seg_imgs = self._load_image(record.path, p) 223 | images.extend(seg_imgs) 224 | if p < record.num_frames: 225 | p += 1 226 | 227 | process_data = self.transform(images) 228 | return process_data, record.label 229 | 230 | def __len__(self): 231 | return len(self.video_list) 232 | -------------------------------------------------------------------------------- /ops/dataset_config.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import os 7 | 8 | ROOT_DATASET = '/data/zhangcan/dataset/' 9 | 10 | 11 | def return_ucf101(modality): 12 | filename_categories = 101 13 | if modality in ['RGB', 'PA', 'Lite']: 14 | root_data = ROOT_DATASET + 'ucf101_frames' 15 | filename_imglist_train = '/data/zhangcan/file_lists/ucf101/split1/train.txt' 16 | filename_imglist_val = '/data/zhangcan/file_lists/ucf101/split1/val.txt' 17 | prefix = 'img_{:05d}.jpg' 18 | elif modality == 'Flow': 19 | root_data = ROOT_DATASET + 'UCF101/jpg' 20 | filename_imglist_train = 'UCF101/file_list/ucf101_flow_train_split_1.txt' 21 | filename_imglist_val = 'UCF101/file_list/ucf101_flow_val_split_1.txt' 22 | prefix = 
'flow_{}_{:05d}.jpg' 23 | else: 24 | raise NotImplementedError('no such modality:' + modality) 25 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 26 | 27 | 28 | def return_hmdb51(modality): 29 | filename_categories = 51 30 | if modality in ['RGB', 'PA', 'Lite']: 31 | root_data = ROOT_DATASET + 'hmdb51_frames' 32 | filename_imglist_train = '/data/zhangcan/file_lists/hmdb51/split1/train.txt' 33 | filename_imglist_val = '/data/zhangcan/file_lists/hmdb51/split1/val.txt' 34 | prefix = 'img_{:05d}.jpg' 35 | elif modality == 'Flow': 36 | root_data = ROOT_DATASET + 'HMDB51/images' 37 | filename_imglist_train = 'HMDB51/splits/hmdb51_flow_train_split_1.txt' 38 | filename_imglist_val = 'HMDB51/splits/hmdb51_flow_val_split_1.txt' 39 | prefix = 'flow_{}_{:05d}.jpg' 40 | else: 41 | raise NotImplementedError('no such modality:' + modality) 42 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 43 | 44 | 45 | def return_something(modality): 46 | filename_categories = 174 47 | if modality in ['RGB', 'PA', 'Lite']: 48 | root_data = ROOT_DATASET + 'sthv1_frames' 49 | filename_imglist_train = '/data/zhangcan/file_lists/sthv1/split/train.txt' 50 | filename_imglist_val = '/data/zhangcan/file_lists/sthv1/split/val.txt' 51 | prefix = '{:05d}.jpg' 52 | elif modality == 'Flow': 53 | root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1-flow' 54 | filename_imglist_train = 'something/v1/train_videofolder_flow.txt' 55 | filename_imglist_val = 'something/v1/val_videofolder_flow.txt' 56 | prefix = '{:06d}-{}_{:05d}.jpg' 57 | else: 58 | print('no such modality:'+modality) 59 | raise NotImplementedError 60 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 61 | 62 | 63 | def return_somethingv2(modality): 64 | filename_categories = 174 65 | if modality in ['RGB', 'PA', 'Lite']: 66 | root_data = ROOT_DATASET + 'sthv2_frames' 67 | filename_imglist_train = '/data/zhangcan/file_lists/sthv2/split/train.txt' 68 | filename_imglist_val = '/data/zhangcan/file_lists/sthv2/split/val.txt' 69 | prefix = '{:06d}.jpg' 70 | elif modality == 'Flow': 71 | root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-flow' 72 | filename_imglist_train = 'something/v2/train_videofolder_flow.txt' 73 | filename_imglist_val = 'something/v2/val_videofolder_flow.txt' 74 | prefix = '{:06d}.jpg' 75 | else: 76 | raise NotImplementedError('no such modality:'+modality) 77 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 78 | 79 | 80 | def return_jester(modality): 81 | filename_categories = 27 82 | if modality in ['RGB', 'PA', 'Lite']: 83 | prefix = '{:05d}.jpg' 84 | root_data = ROOT_DATASET + 'jester_frames' 85 | filename_imglist_train = '/data/zhangcan/file_lists/jester/split/train.txt' 86 | filename_imglist_val = '/data/zhangcan/file_lists/jester/split/val.txt' 87 | else: 88 | raise NotImplementedError('no such modality:'+modality) 89 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 90 | 91 | 92 | def return_kinetics(modality): 93 | filename_categories = 400 94 | if modality in ['RGB', 'PA', 'Lite']: 95 | root_data = ROOT_DATASET + 'kinetics400_frames' 96 | filename_imglist_train = '/data/zhangcan/file_lists/kin400/split/train.txt' 97 | filename_imglist_val = '/data/zhangcan/file_lists/kin400/split/val.txt' 98 | prefix = 'img_{:05d}.jpg' 99 | else: 100 | raise NotImplementedError('no such modality:' + modality) 
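# Annotation (not in the upstream file): ROOT_DATASET and the absolute
# /data/zhangcan/... list paths in this module are machine-specific and must be
# edited for a new setup. Since those list paths are absolute,
# os.path.join(ROOT_DATASET, path) in return_dataset() below leaves them
# unchanged (os.path.join discards earlier components once it meets an absolute
# path), so only the relative entries, e.g. the Flow lists, actually get
# prefixed with ROOT_DATASET.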
101 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix 102 | 103 | 104 | def return_dataset(dataset, modality): 105 | dict_single = {'jester': return_jester, 'something': return_something, 'somethingv2': return_somethingv2, 106 | 'ucf101': return_ucf101, 'hmdb51': return_hmdb51, 107 | 'kinetics': return_kinetics } 108 | if dataset in dict_single: 109 | file_categories, file_imglist_train, file_imglist_val, root_data, prefix = dict_single[dataset](modality) 110 | else: 111 | raise ValueError('Unknown dataset '+dataset) 112 | 113 | file_imglist_train = os.path.join(ROOT_DATASET, file_imglist_train) 114 | file_imglist_val = os.path.join(ROOT_DATASET, file_imglist_val) 115 | if isinstance(file_categories, str): 116 | file_categories = os.path.join(ROOT_DATASET, file_categories) 117 | with open(file_categories) as f: 118 | lines = f.readlines() 119 | categories = [item.rstrip() for item in lines] 120 | else: # number of categories 121 | categories = [None] * file_categories 122 | n_class = len(categories) 123 | print('{}: {} classes'.format(dataset, n_class)) 124 | return n_class, file_imglist_train, file_imglist_val, root_data, prefix 125 | -------------------------------------------------------------------------------- /ops/models.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | from torch import nn 7 | 8 | from ops.basic_ops import ConsensusModule 9 | from ops.transforms import * 10 | from torch.nn.init import normal_, constant_ 11 | from ops.PAN_modules import PA, VAP 12 | 13 | class PAN(nn.Module): 14 | def __init__(self, num_class, num_segments, modality, 15 | base_model='resnet101', new_length=None, 16 | consensus_type='avg', before_softmax=True, 17 | dropout=0.8, img_feature_dim=256, 18 | crop_num=1, partial_bn=True, print_spec=False, pretrain='imagenet', 19 | is_shift=False, shift_div=8, shift_place='blockres', fc_lr5=False, 20 | temporal_pool=False, non_local=False, data_length=1, has_VAP=False): 21 | super(PAN, self).__init__() 22 | self.modality = modality 23 | self.num_segments = num_segments 24 | self.reshape = True 25 | self.before_softmax = before_softmax 26 | self.dropout = dropout 27 | self.crop_num = crop_num 28 | self.consensus_type = consensus_type 29 | self.img_feature_dim = img_feature_dim # the dimension of the CNN feature to represent each frame 30 | self.pretrain = pretrain 31 | 32 | self.is_shift = is_shift 33 | self.shift_div = shift_div 34 | self.shift_place = shift_place 35 | self.base_model_name = base_model 36 | self.fc_lr5 = fc_lr5 37 | self.temporal_pool = temporal_pool 38 | self.non_local = non_local 39 | self.data_length = data_length 40 | self.num_class = num_class 41 | self.has_VIP = has_VAP 42 | 43 | if not before_softmax and consensus_type != 'avg': 44 | raise ValueError("Only avg consensus can be used after Softmax") 45 | 46 | if new_length is None: 47 | self.new_length = 1 if modality == "RGB" or modality in ["PA", "Lite"] else 5 48 | else: 49 | self.new_length = new_length 50 | if print_spec: 51 | print((""" 52 | Initializing PAN with base model: {}. 
53 | PAN Configurations: 54 | input_modality: {} 55 | num_segments: {} 56 | new_length: {} 57 | consensus_module: {} 58 | dropout_ratio: {} 59 | img_feature_dim: {} 60 | """.format(base_model, self.modality, self.num_segments, self.new_length, consensus_type, self.dropout, self.img_feature_dim))) 61 | 62 | self._prepare_base_model(base_model) 63 | 64 | if self.has_VIP: 65 | feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features 66 | setattr(self.base_model, self.base_model.last_layer_name, VAP(self.num_segments, feature_dim, self.num_class, self.dropout)) 67 | else: 68 | feature_dim = self._prepare_tsn(num_class) 69 | 70 | if self.modality == 'Flow': 71 | print("Converting the ImageNet model to a flow init model") 72 | self.base_model = self._construct_flow_model(self.base_model) 73 | print("Done. Flow model ready...") 74 | elif self.modality == 'Lite': 75 | print("=> Converting the ImageNet model to a PAN_Lite init model") 76 | self.base_model = self._construct_pa_model(self.base_model) 77 | print("=> Done. PAN_lite model ready...") 78 | elif self.modality == 'RGBDiff': 79 | print("Converting the ImageNet model to RGB+Diff init model") 80 | self.base_model = self._construct_diff_model(self.base_model) 81 | print("Done. RGBDiff model ready.") 82 | 83 | if not self.has_VIP: 84 | self.consensus = ConsensusModule(consensus_type) 85 | 86 | if not self.before_softmax: 87 | self.softmax = nn.Softmax() 88 | 89 | self._enable_pbn = partial_bn 90 | if partial_bn: 91 | self.partialBN(True) 92 | 93 | def _prepare_tsn(self, num_class): 94 | feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features 95 | if self.dropout == 0: 96 | setattr(self.base_model, self.base_model.last_layer_name, nn.Linear(feature_dim, num_class)) 97 | self.new_fc = None 98 | else: 99 | setattr(self.base_model, self.base_model.last_layer_name, nn.Dropout(p=self.dropout)) 100 | self.new_fc = nn.Linear(feature_dim, num_class) 101 | 102 | std = 0.001 103 | if self.new_fc is None: 104 | normal_(getattr(self.base_model, self.base_model.last_layer_name).weight, 0, std) 105 | constant_(getattr(self.base_model, self.base_model.last_layer_name).bias, 0) 106 | else: 107 | if hasattr(self.new_fc, 'weight'): 108 | normal_(self.new_fc.weight, 0, std) 109 | constant_(self.new_fc.bias, 0) 110 | return feature_dim 111 | 112 | def _prepare_base_model(self, base_model): 113 | print('=> base model: {}'.format(base_model)) 114 | 115 | if 'resnet' in base_model: 116 | if self.modality in ["PA", "Lite"]: 117 | self.PA = PA(self.data_length) 118 | self.base_model = getattr(torchvision.models, base_model)(True if self.pretrain == 'imagenet' else False) 119 | if self.is_shift: 120 | print('=> Adding temporal shift...') 121 | from ops.temporal_shift import make_temporal_shift 122 | make_temporal_shift(self.base_model, self.num_segments, 123 | n_div=self.shift_div, place=self.shift_place, temporal_pool=self.temporal_pool) 124 | 125 | if self.non_local: 126 | print('=> Adding non-local module...') 127 | from ops.non_local import make_non_local 128 | make_non_local(self.base_model, self.num_segments) 129 | 130 | self.base_model.last_layer_name = 'fc' 131 | self.input_size = 224 132 | self.input_mean = [0.485, 0.456, 0.406] 133 | self.input_std = [0.229, 0.224, 0.225] 134 | 135 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1) 136 | 137 | if self.modality == 'Flow': 138 | self.input_mean = [0.5] 139 | self.input_std = [np.mean(self.input_std)] 140 | elif self.modality == 'RGBDiff': 141 | 
self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length 142 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length 143 | 144 | elif base_model == 'mobilenetv2': 145 | from archs.mobilenet_v2 import mobilenet_v2, InvertedResidual 146 | self.base_model = mobilenet_v2(True if self.pretrain == 'imagenet' else False) 147 | 148 | self.base_model.last_layer_name = 'classifier' 149 | self.input_size = 224 150 | self.input_mean = [0.485, 0.456, 0.406] 151 | self.input_std = [0.229, 0.224, 0.225] 152 | 153 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1) 154 | if self.is_shift: 155 | from ops.temporal_shift import TemporalShift 156 | for m in self.base_model.modules(): 157 | if isinstance(m, InvertedResidual) and len(m.conv) == 8 and m.use_res_connect: 158 | if self.print_spec: 159 | print('Adding temporal shift... {}'.format(m.use_res_connect)) 160 | m.conv[0] = TemporalShift(m.conv[0], n_segment=self.num_segments, n_div=self.shift_div) 161 | if self.modality == 'Flow': 162 | self.input_mean = [0.5] 163 | self.input_std = [np.mean(self.input_std)] 164 | elif self.modality == 'RGBDiff': 165 | self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length 166 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length 167 | 168 | elif base_model == 'BNInception': 169 | from archs.bn_inception import bninception 170 | self.base_model = bninception(pretrained=self.pretrain) 171 | self.input_size = self.base_model.input_size 172 | self.input_mean = self.base_model.mean 173 | self.input_std = self.base_model.std 174 | self.base_model.last_layer_name = 'fc' 175 | if self.modality == 'Flow': 176 | self.input_mean = [128] 177 | elif self.modality == 'RGBDiff': 178 | self.input_mean = self.input_mean * (1 + self.new_length) 179 | if self.is_shift: 180 | print('Adding temporal shift...') 181 | self.base_model.build_temporal_ops( 182 | self.num_segments, is_temporal_shift=self.shift_place, shift_div=self.shift_div) 183 | else: 184 | raise ValueError('Unknown base model: {}'.format(base_model)) 185 | 186 | def train(self, mode=True): 187 | """ 188 | Override the default train() to freeze the BN parameters 189 | :return: 190 | """ 191 | super(PAN, self).train(mode) 192 | count = 0 193 | if self._enable_pbn and mode: 194 | print("Freezing BatchNorm2D except the first one.") 195 | for m in self.base_model.modules(): 196 | if isinstance(m, nn.BatchNorm2d): 197 | count += 1 198 | if count >= (2 if self._enable_pbn else 1): 199 | m.eval() 200 | # shutdown update in frozen mode 201 | m.weight.requires_grad = False 202 | m.bias.requires_grad = False 203 | 204 | def partialBN(self, enable): 205 | self._enable_pbn = enable 206 | 207 | def get_optim_policies(self): 208 | first_conv_weight = [] 209 | first_conv_bias = [] 210 | normal_weight = [] 211 | normal_bias = [] 212 | lr5_weight = [] 213 | lr10_bias = [] 214 | bn = [] 215 | custom_ops = [] 216 | 217 | conv_cnt = 0 218 | bn_cnt = 0 219 | for m in self.modules(): 220 | if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv3d): 221 | ps = list(m.parameters()) 222 | conv_cnt += 1 223 | if conv_cnt == 1: 224 | first_conv_weight.append(ps[0]) 225 | if len(ps) == 2: 226 | first_conv_bias.append(ps[1]) 227 | else: 228 | normal_weight.append(ps[0]) 229 | if len(ps) == 2: 230 | normal_bias.append(ps[1]) 231 | elif isinstance(m, torch.nn.Linear): 232 | ps = list(m.parameters()) 233 | if self.fc_lr5: 234 | lr5_weight.append(ps[0]) 235 | else: 236 | 
normal_weight.append(ps[0]) 237 | if len(ps) == 2: 238 | if self.fc_lr5: 239 | lr10_bias.append(ps[1]) 240 | else: 241 | normal_bias.append(ps[1]) 242 | 243 | elif isinstance(m, torch.nn.BatchNorm2d): 244 | bn_cnt += 1 245 | # later BN's are frozen 246 | if not self._enable_pbn or bn_cnt == 1: 247 | bn.extend(list(m.parameters())) 248 | elif isinstance(m, torch.nn.BatchNorm3d): 249 | bn_cnt += 1 250 | # later BN's are frozen 251 | if not self._enable_pbn or bn_cnt == 1: 252 | bn.extend(list(m.parameters())) 253 | elif len(m._modules) == 0: 254 | if len(list(m.parameters())) > 0: 255 | raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m))) 256 | 257 | return [ 258 | {'params': first_conv_weight, 'lr_mult': 5 if self.modality == 'Flow' or self.modality in ['PA', 'Lite'] else 1, 'decay_mult': 1, 259 | 'name': "first_conv_weight"}, 260 | {'params': first_conv_bias, 'lr_mult': 10 if self.modality == 'Flow' or self.modality in ['PA', 'Lite'] else 2, 'decay_mult': 0, 261 | 'name': "first_conv_bias"}, 262 | {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1, 263 | 'name': "normal_weight"}, 264 | {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0, 265 | 'name': "normal_bias"}, 266 | {'params': bn, 'lr_mult': 1, 'decay_mult': 0, 267 | 'name': "BN scale/shift"}, 268 | {'params': custom_ops, 'lr_mult': 1, 'decay_mult': 1, 269 | 'name': "custom_ops"}, 270 | # for fc 271 | {'params': lr5_weight, 'lr_mult': 5, 'decay_mult': 1, 272 | 'name': "lr5_weight"}, 273 | {'params': lr10_bias, 'lr_mult': 10, 'decay_mult': 0, 274 | 'name': "lr10_bias"}, 275 | ] 276 | 277 | def forward(self, input, no_reshape=False): 278 | if not no_reshape: 279 | sample_len = (3 if self.modality in ['RGB', 'PA', 'Lite'] else 2) * self.new_length 280 | 281 | if self.modality == 'RGBDiff': 282 | sample_len = 3 * self.new_length 283 | input = self._get_diff(input) 284 | 285 | if self.modality == 'PA': 286 | base_out = self.PA(input.view((-1, sample_len) + input.size()[-2:])) 287 | base_out = self.base_model(base_out) 288 | elif self.modality == 'Lite': 289 | input = input.view((-1, sample_len) + input.size()[-2:]) 290 | PA = self.PA(input) 291 | RGB = input.view((-1, self.data_length, sample_len) + input.size()[-2:])[:,0,:,:,:] 292 | base_out = torch.cat((RGB, PA), 1) 293 | base_out = self.base_model(base_out) 294 | else: 295 | base_out = self.base_model(input.view((-1, sample_len) + input.size()[-2:])) 296 | else: 297 | base_out = self.base_model(input) 298 | 299 | if self.has_VIP: 300 | return base_out 301 | 302 | if self.dropout > 0: 303 | base_out = self.new_fc(base_out) 304 | 305 | if not self.before_softmax: 306 | base_out = self.softmax(base_out) 307 | 308 | if self.reshape: 309 | if self.is_shift and self.temporal_pool: 310 | base_out = base_out.view((-1, self.num_segments // 2) + base_out.size()[1:]) 311 | else: 312 | base_out = base_out.view((-1, self.num_segments) + base_out.size()[1:]) 313 | output = self.consensus(base_out) 314 | return output.squeeze(1) 315 | 316 | def _get_diff(self, input, keep_rgb=False): 317 | input_c = 3 if self.modality in ["RGB", "PA", "Lite", "RGBDiff"] else 2 318 | input_view = input.view((-1, self.num_segments, self.new_length + 1, input_c,) + input.size()[2:]) 319 | if keep_rgb: 320 | new_data = input_view.clone() 321 | else: 322 | new_data = input_view[:, :, 1:, :, :, :].clone() 323 | 324 | for x in reversed(list(range(1, self.new_length + 1))): 325 | if keep_rgb: 326 | new_data[:, :, x, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, 
:, x - 1, :, :, :] 327 | else: 328 | new_data[:, :, x - 1, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :] 329 | 330 | return new_data 331 | 332 | def _construct_pa_model(self, base_model): 333 | # modify the convolution layers 334 | # Torch models are usually defined in a hierarchical way. 335 | # nn.modules.children() return all sub modules in a DFS manner 336 | modules = list(self.base_model.modules()) 337 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0] 338 | conv_layer = modules[first_conv_idx] 339 | container = modules[first_conv_idx - 1] 340 | 341 | # modify parameters, assume the first blob contains the convolution kernels 342 | params = [x.clone() for x in conv_layer.parameters()] 343 | kernel_size = params[0].size() 344 | new_kernel_size = kernel_size[:1] + (6, ) + kernel_size[2:] 345 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous() 346 | 347 | new_conv = nn.Conv2d(6, conv_layer.out_channels, 348 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding, 349 | bias=True if len(params) == 2 else False) 350 | new_conv.weight.data = new_kernels 351 | if len(params) == 2: 352 | new_conv.bias.data = params[1].data # add bias if neccessary 353 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name 354 | 355 | # replace the first convlution layer 356 | setattr(container, layer_name, new_conv) 357 | 358 | if self.base_model_name == 'BNInception': 359 | import torch.utils.model_zoo as model_zoo 360 | sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1') 361 | base_model.load_state_dict(sd) 362 | print('=> Loading pretrained Flow weight done...') 363 | return base_model 364 | 365 | def _construct_flow_model(self, base_model): 366 | # modify the convolution layers 367 | # Torch models are usually defined in a hierarchical way. 
368 | # nn.modules.children() return all sub modules in a DFS manner 369 | modules = list(self.base_model.modules()) 370 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0] 371 | conv_layer = modules[first_conv_idx] 372 | container = modules[first_conv_idx - 1] 373 | 374 | # modify parameters, assume the first blob contains the convolution kernels 375 | params = [x.clone() for x in conv_layer.parameters()] 376 | kernel_size = params[0].size() 377 | new_kernel_size = kernel_size[:1] + (2 * self.new_length, ) + kernel_size[2:] 378 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous() 379 | 380 | new_conv = nn.Conv2d(2 * self.new_length, conv_layer.out_channels, 381 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding, 382 | bias=True if len(params) == 2 else False) 383 | new_conv.weight.data = new_kernels 384 | if len(params) == 2: 385 | new_conv.bias.data = params[1].data # add bias if neccessary 386 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name 387 | 388 | # replace the first convlution layer 389 | setattr(container, layer_name, new_conv) 390 | 391 | if self.base_model_name == 'BNInception': 392 | import torch.utils.model_zoo as model_zoo 393 | sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1') 394 | base_model.load_state_dict(sd) 395 | print('=> Loading pretrained Flow weight done...') 396 | else: 397 | print('#' * 30, 'Warning! No Flow pretrained model is found') 398 | return base_model 399 | 400 | def _construct_diff_model(self, base_model, keep_rgb=False): 401 | # modify the convolution layers 402 | # Torch models are usually defined in a hierarchical way. 
403 | # nn.modules.children() return all sub modules in a DFS manner 404 | modules = list(self.base_model.modules()) 405 | first_conv_idx = filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules))))[0] 406 | conv_layer = modules[first_conv_idx] 407 | container = modules[first_conv_idx - 1] 408 | 409 | # modify parameters, assume the first blob contains the convolution kernels 410 | params = [x.clone() for x in conv_layer.parameters()] 411 | kernel_size = params[0].size() 412 | if not keep_rgb: 413 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:] 414 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous() 415 | else: 416 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:] 417 | new_kernels = torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), 418 | 1) 419 | new_kernel_size = kernel_size[:1] + (3 + 3 * self.new_length,) + kernel_size[2:] 420 | 421 | new_conv = nn.Conv2d(new_kernel_size[1], conv_layer.out_channels, 422 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding, 423 | bias=True if len(params) == 2 else False) 424 | new_conv.weight.data = new_kernels 425 | if len(params) == 2: 426 | new_conv.bias.data = params[1].data # add bias if neccessary 427 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name 428 | 429 | # replace the first convolution layer 430 | setattr(container, layer_name, new_conv) 431 | return base_model 432 | 433 | @property 434 | def crop_size(self): 435 | return self.input_size 436 | 437 | @property 438 | def scale_size(self): 439 | return self.input_size * 256 // 224 440 | 441 | def get_augmentation(self, flip=True): 442 | if self.modality in ['RGB', 'PA', 'Lite']: 443 | if flip: 444 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66]), 445 | GroupRandomHorizontalFlip(is_flow=False)]) 446 | else: 447 | print('=> NO FLIP!!!') 448 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66])]) 449 | elif self.modality == 'Flow': 450 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]), 451 | GroupRandomHorizontalFlip(is_flow=True)]) 452 | elif self.modality == 'RGBDiff': 453 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]), 454 | GroupRandomHorizontalFlip(is_flow=False)]) 455 | -------------------------------------------------------------------------------- /ops/non_local.py: -------------------------------------------------------------------------------- 1 | # Non-local block using embedded gaussian 2 | # Code from 3 | # https://github.com/AlexHex7/Non-local_pytorch/blob/master/Non-Local_pytorch_0.3.1/lib/non_local_embedded_gaussian.py 4 | 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | 10 | class _NonLocalBlockND(nn.Module): 11 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): 12 | super(_NonLocalBlockND, self).__init__() 13 | 14 | assert dimension in [1, 2, 3] 15 | 16 | self.dimension = dimension 17 | self.sub_sample = sub_sample 18 | 19 | self.in_channels = in_channels 20 | self.inter_channels = inter_channels 21 | 22 | if self.inter_channels is None: 23 | self.inter_channels = in_channels // 2 24 | if self.inter_channels == 0: 25 | self.inter_channels = 1 26 | 27 | if dimension 
== 3: 28 | conv_nd = nn.Conv3d 29 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 30 | bn = nn.BatchNorm3d 31 | elif dimension == 2: 32 | conv_nd = nn.Conv2d 33 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 34 | bn = nn.BatchNorm2d 35 | else: 36 | conv_nd = nn.Conv1d 37 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 38 | bn = nn.BatchNorm1d 39 | 40 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 41 | kernel_size=1, stride=1, padding=0) 42 | 43 | if bn_layer: 44 | self.W = nn.Sequential( 45 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 46 | kernel_size=1, stride=1, padding=0), 47 | bn(self.in_channels) 48 | ) 49 | nn.init.constant_(self.W[1].weight, 0) 50 | nn.init.constant_(self.W[1].bias, 0) 51 | else: 52 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 53 | kernel_size=1, stride=1, padding=0) 54 | nn.init.constant_(self.W.weight, 0) 55 | nn.init.constant_(self.W.bias, 0) 56 | 57 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 58 | kernel_size=1, stride=1, padding=0) 59 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 60 | kernel_size=1, stride=1, padding=0) 61 | 62 | if sub_sample: 63 | self.g = nn.Sequential(self.g, max_pool_layer) 64 | self.phi = nn.Sequential(self.phi, max_pool_layer) 65 | 66 | def forward(self, x): 67 | ''' 68 | :param x: (b, c, t, h, w) 69 | :return: 70 | ''' 71 | 72 | batch_size = x.size(0) 73 | 74 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 75 | g_x = g_x.permute(0, 2, 1) 76 | 77 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) 78 | theta_x = theta_x.permute(0, 2, 1) 79 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) 80 | f = torch.matmul(theta_x, phi_x) 81 | f_div_C = F.softmax(f, dim=-1) 82 | 83 | y = torch.matmul(f_div_C, g_x) 84 | y = y.permute(0, 2, 1).contiguous() 85 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 86 | W_y = self.W(y) 87 | z = W_y + x 88 | 89 | return z 90 | 91 | 92 | class NONLocalBlock1D(_NonLocalBlockND): 93 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 94 | super(NONLocalBlock1D, self).__init__(in_channels, 95 | inter_channels=inter_channels, 96 | dimension=1, sub_sample=sub_sample, 97 | bn_layer=bn_layer) 98 | 99 | 100 | class NONLocalBlock2D(_NonLocalBlockND): 101 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 102 | super(NONLocalBlock2D, self).__init__(in_channels, 103 | inter_channels=inter_channels, 104 | dimension=2, sub_sample=sub_sample, 105 | bn_layer=bn_layer) 106 | 107 | 108 | class NONLocalBlock3D(_NonLocalBlockND): 109 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 110 | super(NONLocalBlock3D, self).__init__(in_channels, 111 | inter_channels=inter_channels, 112 | dimension=3, sub_sample=sub_sample, 113 | bn_layer=bn_layer) 114 | 115 | 116 | class NL3DWrapper(nn.Module): 117 | def __init__(self, block, n_segment): 118 | super(NL3DWrapper, self).__init__() 119 | self.block = block 120 | self.nl = NONLocalBlock3D(block.bn3.num_features) 121 | self.n_segment = n_segment 122 | 123 | def forward(self, x): 124 | x = self.block(x) 125 | 126 | nt, c, h, w = x.size() 127 | x = x.view(nt // self.n_segment, self.n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w 128 | x = self.nl(x) 129 | x = x.transpose(1, 2).contiguous().view(nt, c, h, w) 130 | return x 131 | 132 | 
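# --- Editor's note: illustrative sketch, not part of the original file ---
# NL3DWrapper takes the usual 2D-CNN activation of shape (N*T, C, H, W),
# unfolds the segment dimension to (N, C, T, H, W) so NONLocalBlock3D can
# attend across time as well as space, and then folds it back. A minimal
# shape check (2 videos x 8 segments; channel/spatial sizes are hypothetical):
#
#   import torch
#   from torch import nn
#   block = nn.Identity()                  # stands in for a ResNet bottleneck block
#   block.bn3 = nn.BatchNorm2d(64)         # NL3DWrapper reads bn3.num_features
#   wrapper = NL3DWrapper(block, n_segment=8)
#   x = torch.randn(2 * 8, 64, 14, 14)     # (N*T, C, H, W)
#   assert wrapper(x).shape == x.shape     # output keeps (16, 64, 14, 14)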
133 | def make_non_local(net, n_segment): 134 | import torchvision 135 | import archs 136 | if isinstance(net, torchvision.models.ResNet): 137 | net.layer2 = nn.Sequential( 138 | NL3DWrapper(net.layer2[0], n_segment), 139 | net.layer2[1], 140 | NL3DWrapper(net.layer2[2], n_segment), 141 | net.layer2[3], 142 | ) 143 | net.layer3 = nn.Sequential( 144 | NL3DWrapper(net.layer3[0], n_segment), 145 | net.layer3[1], 146 | NL3DWrapper(net.layer3[2], n_segment), 147 | net.layer3[3], 148 | NL3DWrapper(net.layer3[4], n_segment), 149 | net.layer3[5], 150 | ) 151 | else: 152 | raise NotImplementedError 153 | 154 | 155 | if __name__ == '__main__': 156 | from torch.autograd import Variable 157 | import torch 158 | 159 | sub_sample = True 160 | bn_layer = True 161 | 162 | img = Variable(torch.zeros(2, 3, 20)) 163 | net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer) 164 | out = net(img) 165 | print(out.size()) 166 | 167 | img = Variable(torch.zeros(2, 3, 20, 20)) 168 | net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer) 169 | out = net(img) 170 | print(out.size()) 171 | 172 | img = Variable(torch.randn(2, 3, 10, 20, 20)) 173 | net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer) 174 | out = net(img) 175 | print(out.size()) 176 | -------------------------------------------------------------------------------- /ops/temporal_shift.py: -------------------------------------------------------------------------------- 1 | # Code from "TSM: Temporal Shift Module for Efficient Video Understanding" 2 | # https://github.com/mit-han-lab/temporal-shift-module 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class TemporalShift(nn.Module): 10 | def __init__(self, net, n_segment=3, n_div=8, inplace=False): 11 | super(TemporalShift, self).__init__() 12 | self.net = net 13 | self.n_segment = n_segment 14 | self.fold_div = n_div 15 | self.inplace = inplace 16 | if inplace: 17 | print('=> Using in-place shift...') 18 | #print('=> Using fold div: {}'.format(self.fold_div)) 19 | 20 | def forward(self, x): 21 | x = self.shift(x, self.n_segment, fold_div=self.fold_div, inplace=self.inplace) 22 | return self.net(x) 23 | 24 | @staticmethod 25 | def shift(x, n_segment, fold_div=3, inplace=False): 26 | nt, c, h, w = x.size() 27 | n_batch = nt // n_segment 28 | x = x.view(n_batch, n_segment, c, h, w) 29 | 30 | fold = c // fold_div 31 | if inplace: 32 | # Due to some out of order error when performing parallel computing. 33 | # May need to write a CUDA kernel. 
34 | raise NotImplementedError 35 | # out = InplaceShift.apply(x, fold) 36 | else: 37 | out = torch.zeros_like(x) 38 | out[:, :-1, :fold] = x[:, 1:, :fold] # shift left 39 | out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right 40 | out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift 41 | 42 | return out.view(nt, c, h, w) 43 | 44 | 45 | class InplaceShift(torch.autograd.Function): 46 | # Special thanks to @raoyongming for the help to this function 47 | @staticmethod 48 | def forward(ctx, input, fold): 49 | # not support higher order gradient 50 | # input = input.detach_() 51 | ctx.fold_ = fold 52 | n, t, c, h, w = input.size() 53 | buffer = input.data.new(n, t, fold, h, w).zero_() 54 | buffer[:, :-1] = input.data[:, 1:, :fold] 55 | input.data[:, :, :fold] = buffer 56 | buffer.zero_() 57 | buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold] 58 | input.data[:, :, fold: 2 * fold] = buffer 59 | return input 60 | 61 | @staticmethod 62 | def backward(ctx, grad_output): 63 | # grad_output = grad_output.detach_() 64 | fold = ctx.fold_ 65 | n, t, c, h, w = grad_output.size() 66 | buffer = grad_output.data.new(n, t, fold, h, w).zero_() 67 | buffer[:, 1:] = grad_output.data[:, :-1, :fold] 68 | grad_output.data[:, :, :fold] = buffer 69 | buffer.zero_() 70 | buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold] 71 | grad_output.data[:, :, fold: 2 * fold] = buffer 72 | return grad_output, None 73 | 74 | 75 | class TemporalPool(nn.Module): 76 | def __init__(self, net, n_segment): 77 | super(TemporalPool, self).__init__() 78 | self.net = net 79 | self.n_segment = n_segment 80 | 81 | def forward(self, x): 82 | x = self.temporal_pool(x, n_segment=self.n_segment) 83 | return self.net(x) 84 | 85 | @staticmethod 86 | def temporal_pool(x, n_segment): 87 | nt, c, h, w = x.size() 88 | n_batch = nt // n_segment 89 | x = x.view(n_batch, n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w 90 | x = F.max_pool3d(x, kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0)) 91 | x = x.transpose(1, 2).contiguous().view(nt // 2, c, h, w) 92 | return x 93 | 94 | 95 | def make_temporal_shift(net, n_segment, n_div=8, place='blockres', temporal_pool=False): 96 | if temporal_pool: 97 | n_segment_list = [n_segment, n_segment // 2, n_segment // 2, n_segment // 2] 98 | else: 99 | n_segment_list = [n_segment] * 4 100 | assert n_segment_list[-1] > 0 101 | #print('=> n_segment per stage: {}'.format(n_segment_list)) 102 | 103 | import torchvision 104 | if isinstance(net, torchvision.models.ResNet): 105 | if place == 'block': 106 | def make_block_temporal(stage, this_segment): 107 | blocks = list(stage.children()) 108 | #print('=> Processing stage with {} blocks'.format(len(blocks))) 109 | for i, b in enumerate(blocks): 110 | blocks[i] = TemporalShift(b, n_segment=this_segment, n_div=n_div) 111 | return nn.Sequential(*(blocks)) 112 | 113 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0]) 114 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1]) 115 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2]) 116 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3]) 117 | 118 | elif 'blockres' in place: 119 | n_round = 1 120 | if len(list(net.layer3.children())) >= 23: 121 | n_round = 2 122 | #print('=> Using n_round {} to insert temporal shift'.format(n_round)) 123 | 124 | def make_block_temporal(stage, this_segment): 125 | blocks = list(stage.children()) 126 | #print('=> Processing stage with {} blocks residual'.format(len(blocks))) 127 | for i, b in enumerate(blocks): 
128 | if i % n_round == 0: 129 | blocks[i].conv1 = TemporalShift(b.conv1, n_segment=this_segment, n_div=n_div) 130 | return nn.Sequential(*blocks) 131 | 132 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0]) 133 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1]) 134 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2]) 135 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3]) 136 | else: 137 | raise NotImplementedError(place) 138 | 139 | 140 | def make_temporal_pool(net, n_segment): 141 | import torchvision 142 | if isinstance(net, torchvision.models.ResNet): 143 | print('=> Injecting nonlocal pooling') 144 | net.layer2 = TemporalPool(net.layer2, n_segment) 145 | else: 146 | raise NotImplementedError 147 | 148 | 149 | if __name__ == '__main__': 150 | # test inplace shift v.s. vanilla shift 151 | tsm1 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=False) 152 | tsm2 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=True) 153 | 154 | print('=> Testing CPU...') 155 | # test forward 156 | with torch.no_grad(): 157 | for i in range(10): 158 | x = torch.rand(2 * 8, 3, 224, 224) 159 | y1 = tsm1(x) 160 | y2 = tsm2(x) 161 | assert torch.norm(y1 - y2).item() < 1e-5 162 | 163 | # test backward 164 | with torch.enable_grad(): 165 | for i in range(10): 166 | x1 = torch.rand(2 * 8, 3, 224, 224) 167 | x1.requires_grad_() 168 | x2 = x1.clone() 169 | y1 = tsm1(x1) 170 | y2 = tsm2(x2) 171 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0] 172 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0] 173 | assert torch.norm(grad1 - grad2).item() < 1e-5 174 | 175 | print('=> Testing GPU...') 176 | tsm1.cuda() 177 | tsm2.cuda() 178 | # test forward 179 | with torch.no_grad(): 180 | for i in range(10): 181 | x = torch.rand(2 * 8, 3, 224, 224).cuda() 182 | y1 = tsm1(x) 183 | y2 = tsm2(x) 184 | assert torch.norm(y1 - y2).item() < 1e-5 185 | 186 | # test backward 187 | with torch.enable_grad(): 188 | for i in range(10): 189 | x1 = torch.rand(2 * 8, 3, 224, 224).cuda() 190 | x1.requires_grad_() 191 | x2 = x1.clone() 192 | y1 = tsm1(x1) 193 | y2 = tsm2(x2) 194 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0] 195 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0] 196 | assert torch.norm(grad1 - grad2).item() < 1e-5 197 | print('Test passed.') 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /ops/transforms.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import torchvision 7 | import random 8 | from PIL import Image, ImageOps 9 | import numpy as np 10 | import numbers 11 | import math 12 | import torch 13 | 14 | 15 | class GroupRandomCrop(object): 16 | def __init__(self, size): 17 | if isinstance(size, numbers.Number): 18 | self.size = (int(size), int(size)) 19 | else: 20 | self.size = size 21 | 22 | def __call__(self, img_group): 23 | 24 | w, h = img_group[0].size 25 | th, tw = self.size 26 | 27 | out_images = list() 28 | 29 | x1 = random.randint(0, w - tw) 30 | y1 = random.randint(0, h - th) 31 | 32 | for img in img_group: 33 | assert(img.size[0] == w and img.size[1] == h) 34 | if w == tw and h == th: 35 | out_images.append(img) 36 | else: 37 | out_images.append(img.crop((x1, y1, 
x1 + tw, y1 + th))) 38 | 39 | return out_images 40 | 41 | 42 | class GroupCenterCrop(object): 43 | def __init__(self, size): 44 | self.worker = torchvision.transforms.CenterCrop(size) 45 | 46 | def __call__(self, img_group): 47 | return [self.worker(img) for img in img_group] 48 | 49 | 50 | class GroupRandomHorizontalFlip(object): 51 | """Randomly horizontally flips the given PIL.Image with a probability of 0.5 52 | """ 53 | def __init__(self, is_flow=False): 54 | self.is_flow = is_flow 55 | 56 | def __call__(self, img_group, is_flow=False): 57 | v = random.random() 58 | if v < 0.5: 59 | ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] 60 | if self.is_flow: 61 | for i in range(0, len(ret), 2): 62 | ret[i] = ImageOps.invert(ret[i]) # invert flow pixel values when flipping 63 | return ret 64 | else: 65 | return img_group 66 | 67 | 68 | class GroupNormalize(object): 69 | def __init__(self, mean, std): 70 | self.mean = mean 71 | self.std = std 72 | 73 | def __call__(self, tensor): 74 | rep_mean = self.mean * (tensor.size()[0]//len(self.mean)) 75 | rep_std = self.std * (tensor.size()[0]//len(self.std)) 76 | 77 | # TODO: make efficient 78 | for t, m, s in zip(tensor, rep_mean, rep_std): 79 | t.sub_(m).div_(s) 80 | 81 | return tensor 82 | 83 | 84 | class GroupScale(object): 85 | """ Rescales the input PIL.Image to the given 'size'. 86 | 'size' will be the size of the smaller edge. 87 | For example, if height > width, then image will be 88 | rescaled to (size * height / width, size) 89 | size: size of the smaller edge 90 | interpolation: Default: PIL.Image.BILINEAR 91 | """ 92 | 93 | def __init__(self, size, interpolation=Image.BILINEAR): 94 | self.worker = torchvision.transforms.Resize(size, interpolation) 95 | 96 | def __call__(self, img_group): 97 | return [self.worker(img) for img in img_group] 98 | 99 | 100 | class GroupOverSample(object): 101 | def __init__(self, crop_size, scale_size=None, flip=True): 102 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size) 103 | 104 | if scale_size is not None: 105 | self.scale_worker = GroupScale(scale_size) 106 | else: 107 | self.scale_worker = None 108 | self.flip = flip 109 | 110 | def __call__(self, img_group): 111 | 112 | if self.scale_worker is not None: 113 | img_group = self.scale_worker(img_group) 114 | 115 | image_w, image_h = img_group[0].size 116 | crop_w, crop_h = self.crop_size 117 | 118 | offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h) 119 | oversample_group = list() 120 | for o_w, o_h in offsets: 121 | normal_group = list() 122 | flip_group = list() 123 | for i, img in enumerate(img_group): 124 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) 125 | normal_group.append(crop) 126 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) 127 | 128 | if img.mode == 'L' and i % 2 == 0: 129 | flip_group.append(ImageOps.invert(flip_crop)) 130 | else: 131 | flip_group.append(flip_crop) 132 | 133 | oversample_group.extend(normal_group) 134 | if self.flip: 135 | oversample_group.extend(flip_group) 136 | return oversample_group 137 | 138 | 139 | class GroupFullResSample(object): 140 | def __init__(self, crop_size, scale_size=None, flip=True): 141 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size) 142 | 143 | if scale_size is not None: 144 | self.scale_worker = GroupScale(scale_size) 145 | else: 146 | self.scale_worker = None 147 | self.flip = flip 148 | 149 | def __call__(self, img_group): 150 | 151 | if 
self.scale_worker is not None: 152 | img_group = self.scale_worker(img_group) 153 | 154 | image_w, image_h = img_group[0].size 155 | crop_w, crop_h = self.crop_size 156 | 157 | w_step = (image_w - crop_w) // 4 158 | h_step = (image_h - crop_h) // 4 159 | 160 | offsets = list() 161 | offsets.append((0 * w_step, 2 * h_step)) # left 162 | offsets.append((4 * w_step, 2 * h_step)) # right 163 | offsets.append((2 * w_step, 2 * h_step)) # center 164 | 165 | oversample_group = list() 166 | for o_w, o_h in offsets: 167 | normal_group = list() 168 | flip_group = list() 169 | for i, img in enumerate(img_group): 170 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) 171 | normal_group.append(crop) 172 | if self.flip: 173 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) 174 | 175 | if img.mode == 'L' and i % 2 == 0: 176 | flip_group.append(ImageOps.invert(flip_crop)) 177 | else: 178 | flip_group.append(flip_crop) 179 | 180 | oversample_group.extend(normal_group) 181 | oversample_group.extend(flip_group) 182 | return oversample_group 183 | 184 | 185 | class GroupMultiScaleCrop(object): 186 | 187 | def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True): 188 | self.scales = scales if scales is not None else [1, .875, .75, .66] 189 | self.max_distort = max_distort 190 | self.fix_crop = fix_crop 191 | self.more_fix_crop = more_fix_crop 192 | self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size] 193 | self.interpolation = Image.BILINEAR 194 | 195 | def __call__(self, img_group): 196 | 197 | im_size = img_group[0].size 198 | 199 | crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size) 200 | crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group] 201 | ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation) 202 | for img in crop_img_group] 203 | return ret_img_group 204 | 205 | def _sample_crop_size(self, im_size): 206 | image_w, image_h = im_size[0], im_size[1] 207 | 208 | # find a crop size 209 | base_size = min(image_w, image_h) 210 | crop_sizes = [int(base_size * x) for x in self.scales] 211 | crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes] 212 | crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes] 213 | 214 | pairs = [] 215 | for i, h in enumerate(crop_h): 216 | for j, w in enumerate(crop_w): 217 | if abs(i - j) <= self.max_distort: 218 | pairs.append((w, h)) 219 | 220 | crop_pair = random.choice(pairs) 221 | if not self.fix_crop: 222 | w_offset = random.randint(0, image_w - crop_pair[0]) 223 | h_offset = random.randint(0, image_h - crop_pair[1]) 224 | else: 225 | w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1]) 226 | 227 | return crop_pair[0], crop_pair[1], w_offset, h_offset 228 | 229 | def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h): 230 | offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h) 231 | return random.choice(offsets) 232 | 233 | @staticmethod 234 | def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h): 235 | w_step = (image_w - crop_w) // 4 236 | h_step = (image_h - crop_h) // 4 237 | 238 | ret = list() 239 | ret.append((0, 0)) # upper left 240 | ret.append((4 * w_step, 0)) # upper right 241 | ret.append((0, 4 * h_step)) # lower left 242 | ret.append((4 * w_step, 4 * h_step)) # lower right 243 | ret.append((2 * 
w_step, 2 * h_step)) # center 244 | 245 | if more_fix_crop: 246 | ret.append((0, 2 * h_step)) # center left 247 | ret.append((4 * w_step, 2 * h_step)) # center right 248 | ret.append((2 * w_step, 4 * h_step)) # lower center 249 | ret.append((2 * w_step, 0 * h_step)) # upper center 250 | 251 | ret.append((1 * w_step, 1 * h_step)) # upper left quarter 252 | ret.append((3 * w_step, 1 * h_step)) # upper right quarter 253 | ret.append((1 * w_step, 3 * h_step)) # lower left quarter 254 | ret.append((3 * w_step, 3 * h_step)) # lower righ quarter 255 | 256 | return ret 257 | 258 | 259 | class GroupRandomSizedCrop(object): 260 | """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size 261 | and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio 262 | This is popularly used to train the Inception networks 263 | size: size of the smaller edge 264 | interpolation: Default: PIL.Image.BILINEAR 265 | """ 266 | def __init__(self, size, interpolation=Image.BILINEAR): 267 | self.size = size 268 | self.interpolation = interpolation 269 | 270 | def __call__(self, img_group): 271 | for attempt in range(10): 272 | area = img_group[0].size[0] * img_group[0].size[1] 273 | target_area = random.uniform(0.08, 1.0) * area 274 | aspect_ratio = random.uniform(3. / 4, 4. / 3) 275 | 276 | w = int(round(math.sqrt(target_area * aspect_ratio))) 277 | h = int(round(math.sqrt(target_area / aspect_ratio))) 278 | 279 | if random.random() < 0.5: 280 | w, h = h, w 281 | 282 | if w <= img_group[0].size[0] and h <= img_group[0].size[1]: 283 | x1 = random.randint(0, img_group[0].size[0] - w) 284 | y1 = random.randint(0, img_group[0].size[1] - h) 285 | found = True 286 | break 287 | else: 288 | found = False 289 | x1 = 0 290 | y1 = 0 291 | 292 | if found: 293 | out_group = list() 294 | for img in img_group: 295 | img = img.crop((x1, y1, x1 + w, y1 + h)) 296 | assert(img.size == (w, h)) 297 | out_group.append(img.resize((self.size, self.size), self.interpolation)) 298 | return out_group 299 | else: 300 | # Fallback 301 | scale = GroupScale(self.size, interpolation=self.interpolation) 302 | crop = GroupRandomCrop(self.size) 303 | return crop(scale(img_group)) 304 | 305 | 306 | class Stack(object): 307 | 308 | def __init__(self, roll=False): 309 | self.roll = roll 310 | 311 | def __call__(self, img_group): 312 | if img_group[0].mode == 'L': 313 | return np.concatenate([np.expand_dims(x, 2) for x in img_group], axis=2) 314 | elif img_group[0].mode == 'RGB': 315 | if self.roll: 316 | return np.concatenate([np.array(x)[:, :, ::-1] for x in img_group], axis=2) 317 | else: 318 | return np.concatenate(img_group, axis=2) 319 | 320 | 321 | class ToTorchFormatTensor(object): 322 | """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] 323 | to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ 324 | def __init__(self, div=True): 325 | self.div = div 326 | 327 | def __call__(self, pic): 328 | if isinstance(pic, np.ndarray): 329 | # handle numpy array 330 | img = torch.from_numpy(pic).permute(2, 0, 1).contiguous() 331 | else: 332 | # handle PIL Image 333 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 334 | img = img.view(pic.size[1], pic.size[0], len(pic.mode)) 335 | # put it from HWC to CHW format 336 | # yikes, this transpose takes 80% of the loading time/CPU 337 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 338 | return img.float().div(255) if self.div else img.float() 339 | 340 | 341 | class 
IdentityTransform(object): 342 | 343 | def __call__(self, data): 344 | return data 345 | 346 | 347 | if __name__ == "__main__": 348 | trans = torchvision.transforms.Compose([ 349 | GroupScale(256), 350 | GroupRandomCrop(224), 351 | Stack(), 352 | ToTorchFormatTensor(), 353 | GroupNormalize( 354 | mean=[.485, .456, .406], 355 | std=[.229, .224, .225] 356 | )] 357 | ) 358 | 359 | im = Image.open('../tensorflow-model-zoo.torch/lena_299.png') 360 | 361 | color_group = [im] * 3 362 | rst = trans(color_group) 363 | 364 | gray_group = [im.convert('L')] * 9 365 | gray_rst = trans(gray_group) 366 | 367 | trans2 = torchvision.transforms.Compose([ 368 | GroupRandomSizedCrop(256), 369 | Stack(), 370 | ToTorchFormatTensor(), 371 | GroupNormalize( 372 | mean=[.485, .456, .406], 373 | std=[.229, .224, .225]) 374 | ]) 375 | print(trans2(color_group)) 376 | -------------------------------------------------------------------------------- /ops/utils.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import numpy as np 7 | 8 | 9 | def softmax(scores): 10 | es = np.exp(scores - scores.max(axis=-1)[..., None]) 11 | return es / es.sum(axis=-1)[..., None] 12 | 13 | 14 | class AverageMeter(object): 15 | """Computes and stores the average and current value""" 16 | 17 | def __init__(self): 18 | self.reset() 19 | 20 | def reset(self): 21 | self.val = 0 22 | self.avg = 0 23 | self.sum = 0 24 | self.count = 0 25 | 26 | def update(self, val, n=1): 27 | self.val = val 28 | self.sum += val * n 29 | self.count += n 30 | self.avg = self.sum / self.count 31 | 32 | 33 | def accuracy(output, target, topk=(1,)): 34 | """Computes the precision@k for the specified values of k""" 35 | maxk = max(topk) 36 | batch_size = target.size(0) 37 | 38 | _, pred = output.topk(maxk, 1, True, True) 39 | pred = pred.t() 40 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 41 | 42 | res = [] 43 | for k in topk: 44 | correct_k = correct[:k].view(-1).float().sum(0) 45 | res.append(correct_k.mul_(100.0 / batch_size)) 46 | return res 47 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import argparse 7 | parser = argparse.ArgumentParser(description="PyTorch implementation of Temporal Segment Networks") 8 | parser.add_argument('dataset', type=str) 9 | parser.add_argument('modality', type=str, choices=['Lite', 'RGB', 'PA', 'Flow']) 10 | parser.add_argument('--train_list', type=str, default="") 11 | parser.add_argument('--val_list', type=str, default="") 12 | parser.add_argument('--root_path', type=str, default="") 13 | parser.add_argument('--store_name', type=str, default="") 14 | parser.add_argument('--lmdb', default=False, action="store_true", help='use lmdb format dataset') 15 | # ========================= Model Configs ========================== 16 | parser.add_argument('--arch', type=str, default="BNInception") 17 | parser.add_argument('--num_segments', type=int, default=8) 18 | 
parser.add_argument('--consensus_type', type=str, default='avg') 19 | parser.add_argument('--k', type=int, default=3) 20 | 21 | parser.add_argument('--dropout', '--do', default=0.5, type=float, 22 | metavar='DO', help='dropout ratio (default: 0.5)') 23 | parser.add_argument('--loss_type', type=str, default="nll", 24 | choices=['nll']) 25 | parser.add_argument('--img_feature_dim', default=256, type=int, help="the feature dimension for each frame") 26 | parser.add_argument('--suffix', type=str, default=None) 27 | parser.add_argument('--pretrain', type=str, default='imagenet') 28 | parser.add_argument('--tune_from', type=str, default=None, help='fine-tune from checkpoint') 29 | parser.add_argument('--base', default='TSM', type=str, choices=['TSN', 'TSM']) 30 | 31 | # ========================= Learning Configs ========================== 32 | parser.add_argument('--epochs', default=120, type=int, metavar='N', 33 | help='number of total epochs to run') 34 | parser.add_argument('-b', '--batch-size', default=128, type=int, 35 | metavar='N', help='mini-batch size (default: 256)') 36 | parser.add_argument('--lr', '--learning-rate', default=0.001, type=float, 37 | metavar='LR', help='initial learning rate') 38 | parser.add_argument('--lr_type', default='step', type=str, 39 | metavar='LRtype', help='learning rate type') 40 | parser.add_argument('--lr_steps', default=[50, 100], type=float, nargs="+", 41 | metavar='LRSteps', help='epochs to decay learning rate by 10') 42 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 43 | help='momentum') 44 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 45 | metavar='W', help='weight decay (default: 5e-4)') 46 | parser.add_argument('--clip-gradient', '--gd', default=20, type=float, 47 | metavar='W', help='gradient norm clipping (default: disabled)') 48 | parser.add_argument('--no_partialbn', '--npb', default=True, action="store_true") 49 | parser.add_argument('-i', '--iter-size', default=1, type=int, 50 | metavar='N', help='number of iterations before on update') 51 | 52 | # ========================= Monitor Configs ========================== 53 | parser.add_argument('--print-freq', '-p', default=20, type=int, 54 | metavar='N', help='print frequency (default: 10)') 55 | parser.add_argument('--eval-freq', '-ef', default=1, type=int, 56 | metavar='N', help='evaluation frequency (default: 5)') 57 | 58 | 59 | # ========================= Runtime Configs ========================== 60 | parser.add_argument('-j', '--workers', default=8, type=int, metavar='N', 61 | help='number of data loading workers (default: 8)') 62 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 63 | help='path to latest checkpoint (default: none)') 64 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 65 | help='evaluate model on validation set') 66 | parser.add_argument('--snapshot_pref', type=str, default="") 67 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 68 | help='manual epoch number (useful on restarts)') 69 | parser.add_argument('--gpus', nargs='+', type=int, default=None) 70 | parser.add_argument('--flow_prefix', default="", type=str) 71 | parser.add_argument('--root_log',type=str, default='log') 72 | parser.add_argument('--root_model', type=str, default='checkpoint') 73 | 74 | parser.add_argument('--shift', default=False, action="store_true", help='use shift for models') 75 | parser.add_argument('--shift_div', default=8, type=int, help='number of div for shift 
(default: 8)') 76 | parser.add_argument('--shift_place', default='blockres', type=str, help='place for shift (default: stageres)') 77 | 78 | parser.add_argument('--temporal_pool', default=False, action="store_true", help='add temporal pooling') 79 | parser.add_argument('--non_local', default=False, action="store_true", help='add non local block') 80 | 81 | parser.add_argument('--dense_sample', default=False, action="store_true", help='use dense sample for video dataset') 82 | 83 | parser.add_argument('--VAP', default=True, action="store_true", help='use VAP for various-timescale aggregation') 84 | -------------------------------------------------------------------------------- /pretrained/models_urls.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Models 2 | 3 | Here, we provide the pretrained models of PAN models on Something-Something-V1 & V2 datasets. Recognizing actions in these datasets requires strong temporal modeling ability, as many action classes are symmetrical. PAN achieves state-of-the-art performance on these datasets. Notably, our method even surpasses optical flow based methods while with only RGB frames as input. 4 | 5 | ### Something-Something-V1 6 | 7 |
| Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
| :---: | :---: | :---: | :---: | :---: | :---: |
| PAN_Lite | ResNet-50 | 35.7G * 1 | 48.0 | 76.1 | [Google Drive] or [Weiyun] |
| PAN_Full | ResNet-50 | 67.7G * 1 | 50.5 | 79.2 | |
| PAN_En | ResNet-50 | (46.6G+88.4G) * 2 | 53.4 | 81.1 | |
| PAN_En | ResNet-101 | (85.6G+166.1G) * 2 | 55.3 | 82.8 | [Google Drive] or [Weiyun] |
50 |
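The `PAN_En` rows are score-level ensembles: `test_models.py` runs each listed checkpoint over the validation set (the `En.sh` scripts pass the Lite, RGB and PA checkpoints together) and averages their per-class scores, optionally weighted by `--coeff` and normalized when `--softmax` is given. A minimal sketch of that fusion step, assuming `scores_full` and `scores_lite` are `(num_videos, num_classes)` score arrays already produced by the individual models (these variable names are illustrative):

```python
import numpy as np

def softmax(scores):
    # same normalization as ops/utils.py
    es = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return es / es.sum(axis=-1, keepdims=True)

def ensemble(score_list, coeffs=None):
    # weight each model's normalized scores, then average
    # (test_models.py applies softmax only when --softmax is passed)
    coeffs = coeffs if coeffs is not None else [1.0] * len(score_list)
    fused = sum(c * softmax(s) for c, s in zip(coeffs, score_list))
    return fused / len(score_list)

# pred = ensemble([scores_full, scores_lite]).argmax(axis=1)  # top-1 per video
```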
51 | 52 | ### Something-Something-V2 53 | 54 |
| Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
| :---: | :---: | :---: | :---: | :---: | :---: |
| PAN_Lite | ResNet-50 | 35.7G * 1 | 60.8 | 86.7 | [Google Drive] or [Weiyun] |
| PAN_Full | ResNet-50 | 67.7G * 1 | 63.8 | 88.6 | |
| PAN_En | ResNet-50 | (46.6G+88.4G) * 2 | 66.2 | 90.1 | |
| PAN_En | ResNet-101 | (85.6G+166.1G) * 2 | 66.5 | 90.6 | [Google Drive] or [Weiyun] |
97 |
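To use one of these checkpoints outside the provided test scripts, the state dict has to be loaded the same way `test_models.py` does: the weights were saved from an `nn.DataParallel` wrapper, so the leading `module.` prefix must be stripped. Below is a minimal loading sketch, assuming the Something-Something-V1 Lite checkpoint (174 classes) has been downloaded into `pretrained/`; constructor arguments may need adjusting to match the checkpoint you actually use.

```python
import torch
from ops.models import PAN

net = PAN(174, num_segments=8, modality='Lite', base_model='resnet50',
          is_shift=True, shift_div=8, shift_place='blockres',
          data_length=4, has_VAP=True)

ckpt = torch.load('pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar',
                  map_location='cpu')
# checkpoints were saved from nn.DataParallel, so drop the leading 'module.' prefix
state = {k.split('.', 1)[1]: v for k, v in ckpt['state_dict'].items()}
net.load_state_dict(state)
net.eval()
```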
98 | 99 | -------------------------------------------------------------------------------- /scripts/test/sthv1/En.sh: -------------------------------------------------------------------------------- 1 | python test_models.py something \ 2 | --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8,8 \ 3 | --weights=pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar,pretrained/PAN_RGB_something_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar \ 4 | --full_res --twice_sample 5 | -------------------------------------------------------------------------------- /scripts/test/sthv1/Full.sh: -------------------------------------------------------------------------------- 1 | python test_models.py something --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8 --weights=pretrained/PAN_RGB_something_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar 2 | -------------------------------------------------------------------------------- /scripts/test/sthv1/Lite.sh: -------------------------------------------------------------------------------- 1 | python test_models.py something --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8 --weights=pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar 2 | -------------------------------------------------------------------------------- /scripts/test/sthv2/En.sh: -------------------------------------------------------------------------------- 1 | python test_models.py somethingv2 \ 2 | --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8,8 \ 3 | --weights=pretrained/PAN_Lite_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar,pretrained/PAN_RGB_somethingv2_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar \ 4 | --full_res --twice_sample 5 | -------------------------------------------------------------------------------- /scripts/test/sthv2/Full.sh: -------------------------------------------------------------------------------- 1 | python test_models.py somethingv2 --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8 --weights=pretrained/PAN_RGB_somethingv2_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar 2 | -------------------------------------------------------------------------------- /scripts/test/sthv2/Lite.sh: -------------------------------------------------------------------------------- 1 | python test_models.py somethingv2 --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8 --weights=pretrained/PAN_Lite_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar 2 | -------------------------------------------------------------------------------- /scripts/train/sthv1/Full_PA.sh: -------------------------------------------------------------------------------- 1 | python main.py something PA --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5 2 | -------------------------------------------------------------------------------- /scripts/train/sthv1/Full_RGB.sh: -------------------------------------------------------------------------------- 1 | python main.py something RGB --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 20 40 --epochs 50 --batch-size 64 -i 1 -j 8 
--dropout 0.5 2 | -------------------------------------------------------------------------------- /scripts/train/sthv1/Lite.sh: -------------------------------------------------------------------------------- 1 | python main.py something Lite --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5 2 | -------------------------------------------------------------------------------- /scripts/train/sthv2/Full_PA.sh: -------------------------------------------------------------------------------- 1 | python main.py somethingv2 PA --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5 2 | -------------------------------------------------------------------------------- /scripts/train/sthv2/Full_RGB.sh: -------------------------------------------------------------------------------- 1 | python main.py somethingv2 RGB --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 20 40 --epochs 50 --batch-size 64 -i 1 -j 8 --dropout 0.5 2 | -------------------------------------------------------------------------------- /scripts/train/sthv2/Lite.sh: -------------------------------------------------------------------------------- 1 | python main.py somethingv2 Lite --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5 2 | -------------------------------------------------------------------------------- /test_models.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import argparse 7 | import time 8 | 9 | import torch.nn.parallel 10 | import torch.optim 11 | from sklearn.metrics import confusion_matrix 12 | from ops.dataset import PANDataSet 13 | from ops.models import PAN 14 | from ops.transforms import * 15 | from ops import dataset_config 16 | from torch.nn import functional as F 17 | 18 | # options 19 | parser = argparse.ArgumentParser(description="PAN testing on the full validation set") 20 | parser.add_argument('dataset', type=str) 21 | 22 | # may contain splits 23 | parser.add_argument('--weights', type=str, default=None) 24 | parser.add_argument('--test_segments', type=str, default=25) 25 | parser.add_argument('--dense_sample', default=False, action="store_true", help='use dense sample as I3D') 26 | parser.add_argument('--twice_sample', default=False, action="store_true", help='use twice sample for ensemble') 27 | parser.add_argument('--full_res', default=False, action="store_true", 28 | help='use full resolution 256x256 for test as in Non-local I3D') 29 | 30 | parser.add_argument('--test_crops', type=int, default=1) 31 | parser.add_argument('--coeff', type=str, default=None) 32 | parser.add_argument('--batch_size', type=int, default=1) 33 | parser.add_argument('-j', '--workers', default=8, type=int, metavar='N', 34 | help='number of data loading workers (default: 8)') 35 | 36 | # for true test 37 | parser.add_argument('--test_list', type=str, default=None) 38 | parser.add_argument('--csv_file', type=str, default=None) 39 | 40 | parser.add_argument('--softmax', default=False, action="store_true", help='use softmax') 41 | 42 | parser.add_argument('--max_num', type=int, default=-1) 43 | parser.add_argument('--input_size', type=int, default=224) 44 | 
parser.add_argument('--crop_fusion_type', type=str, default='avg') 45 | parser.add_argument('--gpus', nargs='+', type=int, default=None) 46 | parser.add_argument('--img_feature_dim',type=int, default=256) 47 | parser.add_argument('--num_set_segments',type=int, default=1,help='TODO: select multiply set of n-frames from a video') 48 | parser.add_argument('--pretrain', type=str, default='imagenet') 49 | parser.add_argument('--lmdb', default=False, action="store_true", help='use lmdb format dataset') 50 | parser.add_argument('--VAP', default=False, action="store_true", help='use VAP for various-timescale aggregation') 51 | args = parser.parse_args() 52 | 53 | 54 | class AverageMeter(object): 55 | """Computes and stores the average and current value""" 56 | def __init__(self): 57 | self.reset() 58 | 59 | def reset(self): 60 | self.val = 0 61 | self.avg = 0 62 | self.sum = 0 63 | self.count = 0 64 | 65 | def update(self, val, n=1): 66 | self.val = val 67 | self.sum += val * n 68 | self.count += n 69 | self.avg = self.sum / self.count 70 | 71 | 72 | def accuracy(output, target, topk=(1,)): 73 | """Computes the precision@k for the specified values of k""" 74 | maxk = max(topk) 75 | batch_size = target.size(0) 76 | _, pred = output.topk(maxk, 1, True, True) 77 | pred = pred.t() 78 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 79 | res = [] 80 | for k in topk: 81 | correct_k = correct[:k].view(-1).float().sum(0) 82 | res.append(correct_k.mul_(100.0 / batch_size)) 83 | return res 84 | 85 | 86 | def parse_shift_option_from_log_name(log_name): 87 | if 'shift' in log_name: 88 | strings = log_name.split('_') 89 | for i, s in enumerate(strings): 90 | if 'shift' in s: 91 | break 92 | return True, int(strings[i].replace('shift', '')), strings[i + 1] 93 | else: 94 | return False, None, None 95 | 96 | 97 | weights_list = args.weights.split(',') 98 | test_segments_list = [int(s) for s in args.test_segments.split(',')] 99 | assert len(weights_list) == len(test_segments_list) 100 | if args.coeff is None: 101 | coeff_list = [1] * len(weights_list) 102 | else: 103 | coeff_list = [float(c) for c in args.coeff.split(',')] 104 | 105 | if args.test_list is not None: 106 | test_file_list = args.test_list.split(',') 107 | else: 108 | test_file_list = [None] * len(weights_list) 109 | 110 | 111 | data_iter_list = [] 112 | net_list = [] 113 | modality_list = [] 114 | 115 | total_num = None 116 | for this_weights, this_test_segments, test_file in zip(weights_list, test_segments_list, test_file_list): 117 | is_shift, shift_div, shift_place = parse_shift_option_from_log_name(this_weights) 118 | if 'Lite' in this_weights: 119 | modality = 'Lite' 120 | data_length = 4 121 | elif 'RGB' in this_weights: 122 | modality = 'RGB' 123 | data_length = 1 124 | elif 'PA' in this_weights: 125 | modality = 'PA' 126 | data_length = 4 127 | else: 128 | modality = 'Flow' 129 | data_length = 5 130 | this_arch = this_weights.split('PAN_')[1].split('_')[2] 131 | modality_list.append(modality) 132 | num_class, args.train_list, val_list, root_path, prefix = dataset_config.return_dataset(args.dataset, 133 | modality) 134 | print('=> shift: {}, shift_div: {}, shift_place: {}'.format(is_shift, shift_div, shift_place)) 135 | net = PAN(num_class, this_test_segments if is_shift else 1, modality, 136 | base_model=this_arch, 137 | consensus_type=args.crop_fusion_type, 138 | img_feature_dim=args.img_feature_dim, 139 | pretrain=args.pretrain, 140 | is_shift=is_shift, shift_div=shift_div, shift_place=shift_place, 141 | non_local='_nl' in 
this_weights, 142 | data_length=data_length, 143 | has_VAP=args.VAP, 144 | ) 145 | 146 | if 'tpool' in this_weights: 147 | from ops.temporal_shift import make_temporal_pool 148 | make_temporal_pool(net.base_model, this_test_segments) # since DataParallel 149 | 150 | checkpoint = torch.load(this_weights) 151 | checkpoint = checkpoint['state_dict'] 152 | 153 | # base_dict = {('base_model.' + k).replace('base_model.fc', 'new_fc'): v for k, v in list(checkpoint.items())} 154 | base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())} 155 | replace_dict = {'base_model.classifier.weight': 'new_fc.weight', 156 | 'base_model.classifier.bias': 'new_fc.bias', 157 | } 158 | for k, v in replace_dict.items(): 159 | if k in base_dict: 160 | base_dict[v] = base_dict.pop(k) 161 | 162 | net.load_state_dict(base_dict) 163 | 164 | input_size = net.scale_size if args.full_res else net.input_size 165 | if args.test_crops == 1: 166 | cropping = torchvision.transforms.Compose([ 167 | GroupScale(net.scale_size), 168 | GroupCenterCrop(input_size), 169 | ]) 170 | elif args.test_crops == 3: # do not flip, so only 5 crops 171 | cropping = torchvision.transforms.Compose([ 172 | GroupFullResSample(input_size, net.scale_size, flip=False) 173 | ]) 174 | elif args.test_crops == 5: # do not flip, so only 5 crops 175 | cropping = torchvision.transforms.Compose([ 176 | GroupOverSample(input_size, net.scale_size, flip=False) 177 | ]) 178 | elif args.test_crops == 10: 179 | cropping = torchvision.transforms.Compose([ 180 | GroupOverSample(input_size, net.scale_size) 181 | ]) 182 | else: 183 | raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(args.test_crops)) 184 | 185 | data_loader = torch.utils.data.DataLoader( 186 | PANDataSet(root_path, test_file if test_file is not None else val_list, num_segments=this_test_segments, 187 | new_length=data_length, 188 | modality=modality, 189 | image_tmpl=prefix, 190 | test_mode=True, 191 | remove_missing=len(weights_list) == 1, 192 | transform=torchvision.transforms.Compose([ 193 | cropping, 194 | Stack(roll=(this_arch in ['BNInception', 'InceptionV3'])), 195 | ToTorchFormatTensor(div=(this_arch not in ['BNInception', 'InceptionV3'])), 196 | GroupNormalize(net.input_mean, net.input_std), 197 | ]), dense_sample=args.dense_sample, twice_sample=args.twice_sample, is_lmdb=args.lmdb), 198 | batch_size=args.batch_size, shuffle=False, 199 | num_workers=args.workers, pin_memory=True, 200 | ) 201 | 202 | if args.gpus is not None: 203 | devices = [args.gpus[i] for i in range(args.workers)] 204 | else: 205 | devices = list(range(args.workers)) 206 | 207 | net = torch.nn.DataParallel(net.cuda()) 208 | net.eval() 209 | 210 | data_gen = enumerate(data_loader) 211 | 212 | if total_num is None: 213 | total_num = len(data_loader.dataset) 214 | else: 215 | assert total_num == len(data_loader.dataset) 216 | 217 | data_iter_list.append(data_gen) 218 | net_list.append(net) 219 | 220 | 221 | output = [] 222 | 223 | 224 | def eval_video(video_data, net, this_test_segments, modality): 225 | net.eval() 226 | with torch.no_grad(): 227 | i, data, label = video_data 228 | batch_size = label.numel() 229 | num_crop = args.test_crops 230 | if args.dense_sample: 231 | num_crop *= 10 # 10 clips for testing when using dense sample 232 | 233 | if args.twice_sample: 234 | num_crop *= 2 235 | 236 | if modality == 'RGB': 237 | length = 3 238 | elif modality in ['PA', 'Lite']: 239 | length = 12 240 | elif modality == 'Flow': 241 | length = 10 242 | elif modality == 'RGBDiff': 
243 | length = 18 244 | else: 245 | raise ValueError("Unknown modality "+ modality) 246 | 247 | if modality in ['PA', 'Lite']: 248 | PA_length = 4 249 | else: 250 | PA_length = 1 251 | 252 | data_in = data.view(-1, length, data.size(2), data.size(3)) 253 | if is_shift: 254 | data_in = data_in.view(batch_size * num_crop, this_test_segments, length, data_in.size(2), data_in.size(3)) 255 | rst = net(data_in) 256 | rst = rst.reshape(batch_size, num_crop, -1).mean(1) 257 | 258 | if args.softmax: 259 | # take the softmax to normalize the output to probability 260 | rst = F.softmax(rst, dim=1) 261 | 262 | rst = rst.data.cpu().numpy().copy() 263 | 264 | if net.module.is_shift: 265 | rst = rst.reshape(batch_size, num_class) 266 | else: 267 | rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class)) 268 | 269 | return i, rst, label 270 | 271 | 272 | proc_start_time = time.time() 273 | max_num = args.max_num if args.max_num > 0 else total_num 274 | 275 | top1 = AverageMeter() 276 | top5 = AverageMeter() 277 | 278 | for i, data_label_pairs in enumerate(zip(*data_iter_list)): 279 | with torch.no_grad(): 280 | if i >= max_num: 281 | break 282 | this_rst_list = [] 283 | this_label = None 284 | for n_seg, (_, (data, label)), net, modality in zip(test_segments_list, data_label_pairs, net_list, modality_list): 285 | rst = eval_video((i, data, label), net, n_seg, modality) 286 | this_rst_list.append(rst[1]) 287 | this_label = label 288 | assert len(this_rst_list) == len(coeff_list) 289 | for i_coeff in range(len(this_rst_list)): 290 | this_rst_list[i_coeff] *= coeff_list[i_coeff] 291 | ensembled_predict = sum(this_rst_list) / len(this_rst_list) 292 | 293 | for p, g in zip(ensembled_predict, this_label.cpu().numpy()): 294 | output.append([p[None, ...], g]) 295 | cnt_time = time.time() - proc_start_time 296 | prec1, prec5 = accuracy(torch.from_numpy(ensembled_predict), this_label, topk=(1, 5)) 297 | top1.update(prec1.item(), this_label.numel()) 298 | top5.update(prec5.item(), this_label.numel()) 299 | if i % 20 == 0: 300 | print('video {} done, total {}/{}, average {:.3f} sec/video, ' 301 | 'moving Prec@1 {:.3f} Prec@5 {:.3f}'.format(i * args.batch_size, i * args.batch_size, total_num, 302 | float(cnt_time) / (i+1) / args.batch_size, top1.avg, top5.avg)) 303 | 304 | video_pred = [np.argmax(x[0]) for x in output] 305 | video_pred_top5 = [np.argsort(np.mean(x[0], axis=0).reshape(-1))[::-1][:5] for x in output] 306 | 307 | video_labels = [x[1] for x in output] 308 | 309 | 310 | if args.csv_file is not None: 311 | print('=> Writing result to csv file: {}'.format(args.csv_file)) 312 | with open('sth_category.txt') as f: 313 | categories = f.readlines() 314 | categories = sorted([f.strip() for f in categories]) 315 | with open(test_file_list[0]) as f: 316 | vid_names = f.readlines() 317 | vid_names = [n.split(' ')[0] for n in vid_names] 318 | assert len(vid_names) == len(video_pred) 319 | if args.dataset != 'somethingv2': # only output top1 320 | with open(args.csv_file, 'w') as f: 321 | for n, pred in zip(vid_names, video_pred): 322 | f.write('{};{}\n'.format(n, categories[pred])) 323 | else: 324 | with open(args.csv_file, 'w') as f: 325 | for n, pred5 in zip(vid_names, video_pred_top5): 326 | fill = [n] 327 | for p in list(pred5): 328 | fill.append(p) 329 | f.write('{};{};{};{};{};{}\n'.format(*fill)) 330 | 331 | 332 | cf = confusion_matrix(video_labels, video_pred).astype(float) 333 | 334 | np.save('cm.npy', cf) 335 | cls_cnt = cf.sum(axis=1) 336 | cls_hit = np.diag(cf) 337 | 
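# per-class accuracy: diagonal hits divided by the number of ground-truth samples of each class (row sums of the confusion matrix)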
338 | cls_acc = cls_hit / cls_cnt 339 | print(cls_acc) 340 | upper = np.mean(np.max(cf, axis=1) / cls_cnt) 341 | print('upper bound: {}'.format(upper)) 342 | 343 | print('-----Evaluation is finished------') 344 | print('Class Accuracy {:.02f}%'.format(np.mean(cls_acc) * 100)) 345 | print('Overall Prec@1 {:.02f}% Prec@5 {:.02f}%'.format(top1.avg, top5.avg)) 346 | 347 | # reorder before saving 348 | name_list = [x.strip().split()[0] for x in open(val_list)] 349 | 350 | order_dict = {e:i for i, e in enumerate(sorted(name_list))} 351 | 352 | reorder_output = [None] * len(output) 353 | reorder_label = [None] * len(output) 354 | 355 | for i in range(len(output)): 356 | idx = order_dict[name_list[i]] 357 | reorder_output[idx] = output[i] 358 | reorder_label[idx] = video_labels[i] 359 | 360 | if set(['PA', 'RGB']) == set(modality_list): 361 | modality = 'Full' 362 | elif set(['PA', 'RGB', 'Lite']) == set(modality_list): 363 | modality = 'En' 364 | 365 | np.savez("_".join([args.dataset, modality, str(top1.avg)]), scores=reorder_output, labels=reorder_label) 366 | -------------------------------------------------------------------------------- /tools/gen_label_kinetics.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | # ------------------------------------------------------ 6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py 7 | 8 | import os 9 | 10 | 11 | dataset_path = '/ssd/video/kinetics/images256/' 12 | label_path = '/ssd/video/kinetics/labels' 13 | 14 | if __name__ == '__main__': 15 | with open('kinetics_label_map.txt') as f: 16 | categories = f.readlines() 17 | categories = [c.strip().replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') for c in categories] 18 | assert len(set(categories)) == 400 19 | dict_categories = {} 20 | for i, category in enumerate(categories): 21 | dict_categories[category] = i 22 | 23 | print(dict_categories) 24 | 25 | files_input = ['kinetics_val.csv', 'kinetics_train.csv'] 26 | files_output = ['val_videofolder.txt', 'train_videofolder.txt'] 27 | for (filename_input, filename_output) in zip(files_input, files_output): 28 | count_cat = {k: 0 for k in dict_categories.keys()} 29 | with open(os.path.join(label_path, filename_input)) as f: 30 | lines = f.readlines()[1:] 31 | folders = [] 32 | idx_categories = [] 33 | categories_list = [] 34 | for line in lines: 35 | line = line.rstrip() 36 | items = line.split(',') 37 | folders.append(items[1] + '_' + items[2]) 38 | this_catergory = items[0].replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') 39 | categories_list.append(this_catergory) 40 | idx_categories.append(dict_categories[this_catergory]) 41 | count_cat[this_catergory] += 1 42 | print(max(count_cat.values())) 43 | 44 | assert len(idx_categories) == len(folders) 45 | missing_folders = [] 46 | output = [] 47 | for i in range(len(folders)): 48 | curFolder = folders[i] 49 | curIDX = idx_categories[i] 50 | # counting the number of frames in each video folders 51 | img_dir = os.path.join(dataset_path, categories_list[i], curFolder) 52 | if not os.path.exists(img_dir): 53 | missing_folders.append(img_dir) 54 | # print(missing_folders) 55 | else: 56 | dir_files = os.listdir(img_dir) 57 | 
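# one record per video: '<category>/<video_folder> <num_frames> <label_index>'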
output.append('%s %d %d'%(os.path.join(categories_list[i], curFolder), len(dir_files), curIDX)) 58 | print('%d/%d, missing %d'%(i, len(folders), len(missing_folders))) 59 | with open(os.path.join(label_path, filename_output),'w') as f: 60 | f.write('\n'.join(output)) 61 | with open(os.path.join(label_path, 'missing_' + filename_output),'w') as f: 62 | f.write('\n'.join(missing_folders)) 63 | -------------------------------------------------------------------------------- /tools/gen_label_sthv1.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | # ------------------------------------------------------ 6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py 7 | # processing the raw data of the video Something-Something-V1 8 | 9 | import os 10 | 11 | if __name__ == '__main__': 12 | dataset_name = 'something-something-v1' # 'jester-v1' 13 | with open('%s-labels.csv' % dataset_name) as f: 14 | lines = f.readlines() 15 | categories = [] 16 | for line in lines: 17 | line = line.rstrip() 18 | categories.append(line) 19 | categories = sorted(categories) 20 | with open('category.txt', 'w') as f: 21 | f.write('\n'.join(categories)) 22 | 23 | dict_categories = {} 24 | for i, category in enumerate(categories): 25 | dict_categories[category] = i 26 | 27 | files_input = ['%s-validation.csv' % dataset_name, '%s-train.csv' % dataset_name] 28 | files_output = ['val_videofolder.txt', 'train_videofolder.txt'] 29 | for (filename_input, filename_output) in zip(files_input, files_output): 30 | with open(filename_input) as f: 31 | lines = f.readlines() 32 | folders = [] 33 | idx_categories = [] 34 | for line in lines: 35 | line = line.rstrip() 36 | items = line.split(';') 37 | folders.append(items[0]) 38 | idx_categories.append(dict_categories[items[1]]) 39 | output = [] 40 | for i in range(len(folders)): 41 | curFolder = folders[i] 42 | curIDX = idx_categories[i] 43 | # counting the number of frames in each video folders 44 | dir_files = os.listdir(os.path.join('../img', curFolder)) 45 | output.append('%s %d %d' % ('something/v1/img/' + curFolder, len(dir_files), curIDX)) 46 | print('%d/%d' % (i, len(folders))) 47 | with open(filename_output, 'w') as f: 48 | f.write('\n'.join(output)) 49 | -------------------------------------------------------------------------------- /tools/gen_label_sthv2.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | # ------------------------------------------------------ 6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py 7 | # processing the raw data of the video Something-Something-V2 8 | 9 | import os 10 | import json 11 | 12 | if __name__ == '__main__': 13 | dataset_name = 'something-something-v2' # 'jester-v1' 14 | with open('%s-labels.json' % dataset_name) as f: 15 | data = json.load(f) 16 | categories = [] 17 | for i, (cat, idx) in enumerate(data.items()): 18 | assert i == int(idx) # make sure the rank is right 19 | categories.append(cat) 20 | 21 | with 
open('category.txt', 'w') as f: 22 | f.write('\n'.join(categories)) 23 | 24 | dict_categories = {} 25 | for i, category in enumerate(categories): 26 | dict_categories[category] = i 27 | 28 | files_input = ['%s-validation.json' % dataset_name, '%s-train.json' % dataset_name, '%s-test.json' % dataset_name] 29 | files_output = ['val_videofolder.txt', 'train_videofolder.txt', 'test_videofolder.txt'] 30 | for (filename_input, filename_output) in zip(files_input, files_output): 31 | with open(filename_input) as f: 32 | data = json.load(f) 33 | folders = [] 34 | idx_categories = [] 35 | for item in data: 36 | folders.append(item['id']) 37 | if 'test' not in filename_input: 38 | idx_categories.append(dict_categories[item['template'].replace('[', '').replace(']', '')]) 39 | else: 40 | idx_categories.append(0) 41 | output = [] 42 | for i in range(len(folders)): 43 | curFolder = folders[i] 44 | curIDX = idx_categories[i] 45 | # counting the number of frames in each video folders 46 | dir_files = os.listdir(os.path.join('20bn-something-something-v2-frames', curFolder)) 47 | output.append('%s %d %d' % (curFolder, len(dir_files), curIDX)) 48 | print('%d/%d' % (i, len(folders))) 49 | with open(filename_output, 'w') as f: 50 | f.write('\n'.join(output)) 51 | -------------------------------------------------------------------------------- /tools/kinetics_label_map.txt: -------------------------------------------------------------------------------- 1 | abseiling 2 | air drumming 3 | answering questions 4 | applauding 5 | applying cream 6 | archery 7 | arm wrestling 8 | arranging flowers 9 | assembling computer 10 | auctioning 11 | baby waking up 12 | baking cookies 13 | balloon blowing 14 | bandaging 15 | barbequing 16 | bartending 17 | beatboxing 18 | bee keeping 19 | belly dancing 20 | bench pressing 21 | bending back 22 | bending metal 23 | biking through snow 24 | blasting sand 25 | blowing glass 26 | blowing leaves 27 | blowing nose 28 | blowing out candles 29 | bobsledding 30 | bookbinding 31 | bouncing on trampoline 32 | bowling 33 | braiding hair 34 | breading or breadcrumbing 35 | breakdancing 36 | brush painting 37 | brushing hair 38 | brushing teeth 39 | building cabinet 40 | building shed 41 | bungee jumping 42 | busking 43 | canoeing or kayaking 44 | capoeira 45 | carrying baby 46 | cartwheeling 47 | carving pumpkin 48 | catching fish 49 | catching or throwing baseball 50 | catching or throwing frisbee 51 | catching or throwing softball 52 | celebrating 53 | changing oil 54 | changing wheel 55 | checking tires 56 | cheerleading 57 | chopping wood 58 | clapping 59 | clay pottery making 60 | clean and jerk 61 | cleaning floor 62 | cleaning gutters 63 | cleaning pool 64 | cleaning shoes 65 | cleaning toilet 66 | cleaning windows 67 | climbing a rope 68 | climbing ladder 69 | climbing tree 70 | contact juggling 71 | cooking chicken 72 | cooking egg 73 | cooking on campfire 74 | cooking sausages 75 | counting money 76 | country line dancing 77 | cracking neck 78 | crawling baby 79 | crossing river 80 | crying 81 | curling hair 82 | cutting nails 83 | cutting pineapple 84 | cutting watermelon 85 | dancing ballet 86 | dancing charleston 87 | dancing gangnam style 88 | dancing macarena 89 | deadlifting 90 | decorating the christmas tree 91 | digging 92 | dining 93 | disc golfing 94 | diving cliff 95 | dodgeball 96 | doing aerobics 97 | doing laundry 98 | doing nails 99 | drawing 100 | dribbling basketball 101 | drinking 102 | drinking beer 103 | drinking shots 104 | driving car 105 | driving tractor 
106 | drop kicking 107 | drumming fingers 108 | dunking basketball 109 | dying hair 110 | eating burger 111 | eating cake 112 | eating carrots 113 | eating chips 114 | eating doughnuts 115 | eating hotdog 116 | eating ice cream 117 | eating spaghetti 118 | eating watermelon 119 | egg hunting 120 | exercising arm 121 | exercising with an exercise ball 122 | extinguishing fire 123 | faceplanting 124 | feeding birds 125 | feeding fish 126 | feeding goats 127 | filling eyebrows 128 | finger snapping 129 | fixing hair 130 | flipping pancake 131 | flying kite 132 | folding clothes 133 | folding napkins 134 | folding paper 135 | front raises 136 | frying vegetables 137 | garbage collecting 138 | gargling 139 | getting a haircut 140 | getting a tattoo 141 | giving or receiving award 142 | golf chipping 143 | golf driving 144 | golf putting 145 | grinding meat 146 | grooming dog 147 | grooming horse 148 | gymnastics tumbling 149 | hammer throw 150 | headbanging 151 | headbutting 152 | high jump 153 | high kick 154 | hitting baseball 155 | hockey stop 156 | holding snake 157 | hopscotch 158 | hoverboarding 159 | hugging 160 | hula hooping 161 | hurdling 162 | hurling (sport) 163 | ice climbing 164 | ice fishing 165 | ice skating 166 | ironing 167 | javelin throw 168 | jetskiing 169 | jogging 170 | juggling balls 171 | juggling fire 172 | juggling soccer ball 173 | jumping into pool 174 | jumpstyle dancing 175 | kicking field goal 176 | kicking soccer ball 177 | kissing 178 | kitesurfing 179 | knitting 180 | krumping 181 | laughing 182 | laying bricks 183 | long jump 184 | lunge 185 | making a cake 186 | making a sandwich 187 | making bed 188 | making jewelry 189 | making pizza 190 | making snowman 191 | making sushi 192 | making tea 193 | marching 194 | massaging back 195 | massaging feet 196 | massaging legs 197 | massaging person's head 198 | milking cow 199 | mopping floor 200 | motorcycling 201 | moving furniture 202 | mowing lawn 203 | news anchoring 204 | opening bottle 205 | opening present 206 | paragliding 207 | parasailing 208 | parkour 209 | passing American football (in game) 210 | passing American football (not in game) 211 | peeling apples 212 | peeling potatoes 213 | petting animal (not cat) 214 | petting cat 215 | picking fruit 216 | planting trees 217 | plastering 218 | playing accordion 219 | playing badminton 220 | playing bagpipes 221 | playing basketball 222 | playing bass guitar 223 | playing cards 224 | playing cello 225 | playing chess 226 | playing clarinet 227 | playing controller 228 | playing cricket 229 | playing cymbals 230 | playing didgeridoo 231 | playing drums 232 | playing flute 233 | playing guitar 234 | playing harmonica 235 | playing harp 236 | playing ice hockey 237 | playing keyboard 238 | playing kickball 239 | playing monopoly 240 | playing organ 241 | playing paintball 242 | playing piano 243 | playing poker 244 | playing recorder 245 | playing saxophone 246 | playing squash or racquetball 247 | playing tennis 248 | playing trombone 249 | playing trumpet 250 | playing ukulele 251 | playing violin 252 | playing volleyball 253 | playing xylophone 254 | pole vault 255 | presenting weather forecast 256 | pull ups 257 | pumping fist 258 | pumping gas 259 | punching bag 260 | punching person (boxing) 261 | push up 262 | pushing car 263 | pushing cart 264 | pushing wheelchair 265 | reading book 266 | reading newspaper 267 | recording music 268 | riding a bike 269 | riding camel 270 | riding elephant 271 | riding mechanical bull 272 | riding mountain bike 273 | 
riding mule 274 | riding or walking with horse 275 | riding scooter 276 | riding unicycle 277 | ripping paper 278 | robot dancing 279 | rock climbing 280 | rock scissors paper 281 | roller skating 282 | running on treadmill 283 | sailing 284 | salsa dancing 285 | sanding floor 286 | scrambling eggs 287 | scuba diving 288 | setting table 289 | shaking hands 290 | shaking head 291 | sharpening knives 292 | sharpening pencil 293 | shaving head 294 | shaving legs 295 | shearing sheep 296 | shining shoes 297 | shooting basketball 298 | shooting goal (soccer) 299 | shot put 300 | shoveling snow 301 | shredding paper 302 | shuffling cards 303 | side kick 304 | sign language interpreting 305 | singing 306 | situp 307 | skateboarding 308 | ski jumping 309 | skiing (not slalom or crosscountry) 310 | skiing crosscountry 311 | skiing slalom 312 | skipping rope 313 | skydiving 314 | slacklining 315 | slapping 316 | sled dog racing 317 | smoking 318 | smoking hookah 319 | snatch weight lifting 320 | sneezing 321 | sniffing 322 | snorkeling 323 | snowboarding 324 | snowkiting 325 | snowmobiling 326 | somersaulting 327 | spinning poi 328 | spray painting 329 | spraying 330 | springboard diving 331 | squat 332 | sticking tongue out 333 | stomping grapes 334 | stretching arm 335 | stretching leg 336 | strumming guitar 337 | surfing crowd 338 | surfing water 339 | sweeping floor 340 | swimming backstroke 341 | swimming breast stroke 342 | swimming butterfly stroke 343 | swing dancing 344 | swinging legs 345 | swinging on something 346 | sword fighting 347 | tai chi 348 | taking a shower 349 | tango dancing 350 | tap dancing 351 | tapping guitar 352 | tapping pen 353 | tasting beer 354 | tasting food 355 | testifying 356 | texting 357 | throwing axe 358 | throwing ball 359 | throwing discus 360 | tickling 361 | tobogganing 362 | tossing coin 363 | tossing salad 364 | training dog 365 | trapezing 366 | trimming or shaving beard 367 | trimming trees 368 | triple jump 369 | tying bow tie 370 | tying knot (not on a tie) 371 | tying tie 372 | unboxing 373 | unloading truck 374 | using computer 375 | using remote controller (not gaming) 376 | using segway 377 | vault 378 | waiting in line 379 | walking the dog 380 | washing dishes 381 | washing feet 382 | washing hair 383 | washing hands 384 | water skiing 385 | water sliding 386 | watering plants 387 | waxing back 388 | waxing chest 389 | waxing eyebrows 390 | waxing legs 391 | weaving basket 392 | welding 393 | whistling 394 | windsurfing 395 | wrapping present 396 | wrestling 397 | writing 398 | yawning 399 | yoga 400 | zumba -------------------------------------------------------------------------------- /tools/vid2img_kinetics.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | from __future__ import print_function, division 7 | import os 8 | import sys 9 | import subprocess 10 | from multiprocessing import Pool 11 | from tqdm import tqdm 12 | 13 | n_thread = 100 14 | 15 | 16 | def vid2jpg(file_name, class_path, dst_class_path): 17 | if '.mp4' not in file_name: 18 | return 19 | name, ext = os.path.splitext(file_name) 20 | dst_directory_path = os.path.join(dst_class_path, name) 21 | 22 | video_file_path = os.path.join(class_path, file_name) 23 | try: 24 | if 
os.path.exists(dst_directory_path): 25 | if not os.path.exists(os.path.join(dst_directory_path, 'img_00001.jpg')): 26 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 27 | print('remove {}'.format(dst_directory_path)) 28 | os.mkdir(dst_directory_path) 29 | else: 30 | print('*** convert has been done: {}'.format(dst_directory_path)) 31 | return 32 | else: 33 | os.mkdir(dst_directory_path) 34 | except: 35 | print(dst_directory_path) 36 | return 37 | cmd = 'ffmpeg -i \"{}\" -threads 1 -vf scale=-1:331 -q:v 0 \"{}/img_%05d.jpg\"'.format(video_file_path, dst_directory_path) 38 | # print(cmd) 39 | subprocess.call(cmd, shell=True, 40 | stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 41 | 42 | 43 | def class_process(dir_path, dst_dir_path, class_name): 44 | print('*' * 20, class_name, '*'*20) 45 | class_path = os.path.join(dir_path, class_name) 46 | if not os.path.isdir(class_path): 47 | print('*** is not a dir {}'.format(class_path)) 48 | return 49 | 50 | dst_class_path = os.path.join(dst_dir_path, class_name) 51 | if not os.path.exists(dst_class_path): 52 | os.mkdir(dst_class_path) 53 | 54 | vid_list = os.listdir(class_path) 55 | vid_list.sort() 56 | p = Pool(n_thread) 57 | from functools import partial 58 | worker = partial(vid2jpg, class_path=class_path, dst_class_path=dst_class_path) 59 | for _ in tqdm(p.imap_unordered(worker, vid_list), total=len(vid_list)): 60 | pass 61 | # p.map(worker, vid_list) 62 | p.close() 63 | p.join() 64 | 65 | print('\n') 66 | 67 | 68 | if __name__ == "__main__": 69 | dir_path = sys.argv[1] 70 | dst_dir_path = sys.argv[2] 71 | 72 | class_list = os.listdir(dir_path) 73 | class_list.sort() 74 | for class_name in class_list: 75 | class_process(dir_path, dst_dir_path, class_name) 76 | 77 | class_name = 'test' 78 | class_process(dir_path, dst_dir_path, class_name) 79 | -------------------------------------------------------------------------------- /tools/vid2img_sthv2.py: -------------------------------------------------------------------------------- 1 | # Code for paper: 2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance" 3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan 4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch 5 | 6 | import os 7 | import threading 8 | 9 | NUM_THREADS = 100 10 | VIDEO_ROOT = '/ssd/video/something/v2/20bn-something-something-v2' # Downloaded webm videos 11 | FRAME_ROOT = '/ssd/video/something/v2/20bn-something-something-v2-frames' # Directory for extracted frames 12 | 13 | 14 | def split(l, n): 15 | """Yield successive n-sized chunks from l.""" 16 | for i in range(0, len(l), n): 17 | yield l[i:i + n] 18 | 19 | 20 | def extract(video, tmpl='%06d.jpg'): 21 | # os.system(f'ffmpeg -i {VIDEO_ROOT}/{video} -vf -threads 1 -vf scale=-1:256 -q:v 0 ' 22 | # f'{FRAME_ROOT}/{video[:-5]}/{tmpl}') 23 | cmd = 'ffmpeg -i \"{}/{}\" -threads 1 -vf scale=-1:256 -q:v 0 \"{}/{}/%06d.jpg\"'.format(VIDEO_ROOT, video, 24 | FRAME_ROOT, video[:-5]) 25 | os.system(cmd) 26 | 27 | 28 | def target(video_list): 29 | for video in video_list: 30 | os.makedirs(os.path.join(FRAME_ROOT, video[:-5])) 31 | extract(video) 32 | 33 | 34 | if __name__ == '__main__': 35 | if not os.path.exists(VIDEO_ROOT): 36 | raise ValueError('Please download videos and set VIDEO_ROOT variable.') 37 | if not os.path.exists(FRAME_ROOT): 38 | os.makedirs(FRAME_ROOT) 39 | 40 | video_list = os.listdir(VIDEO_ROOT) 41 | splits = list(split(video_list, NUM_THREADS)) 42 | 43 | threads = [] 44 | for i, split in 
enumerate(splits): 45 | thread = threading.Thread(target=target, args=(split,)) 46 | thread.start() 47 | threads.append(thread) 48 | 49 | for thread in threads: 50 | thread.join() 51 | --------------------------------------------------------------------------------
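Usage sketch (not a file from this repository; the checkpoint paths and names below are hypothetical placeholders): test_models.py above is driven from the command line, and the invocation below is one plausible way to run the two-stream 'Full' ensemble (RGB + PA) on Something-Something-V1 using only the flags defined in its argument parser. The placeholder file names are spelled so that the script can recover the modality ('RGB' / 'PA'), the backbone token after 'PAN_', and the 'shift8_blockres' option from the names, which is how its parsing logic works.

python test_models.py something \
    --weights=ckpt/PAN_something_RGB_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,ckpt/PAN_something_PA_resnet50_shift8_blockres_avg_segment8_e80.pth.tar \
    --test_segments=8,8 --test_crops=1 --batch_size=16 -j 8 --softmax

With two weight files whose modalities are RGB and PA, the script labels the saved score file as the 'Full' setting; adding a third 'Lite' checkpoint (and a matching entry in --test_segments) would switch it to 'En'.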