├── LICENSE
├── README.md
├── archs
│   ├── __init__.py
│   ├── bn_inception.py
│   └── mobilenet_v2.py
├── main.py
├── ops
│   ├── PAN_modules.py
│   ├── __init__.py
│   ├── basic_ops.py
│   ├── dataset.py
│   ├── dataset_config.py
│   ├── models.py
│   ├── non_local.py
│   ├── temporal_shift.py
│   ├── transforms.py
│   └── utils.py
├── opts.py
├── pretrained
│   └── models_urls.md
├── scripts
│   ├── test
│   │   ├── sthv1
│   │   │   ├── En.sh
│   │   │   ├── Full.sh
│   │   │   └── Lite.sh
│   │   └── sthv2
│   │       ├── En.sh
│   │       ├── Full.sh
│   │       └── Lite.sh
│   └── train
│       ├── sthv1
│       │   ├── Full_PA.sh
│       │   ├── Full_RGB.sh
│       │   └── Lite.sh
│       └── sthv2
│           ├── Full_PA.sh
│           ├── Full_RGB.sh
│           └── Lite.sh
├── test_models.py
└── tools
    ├── gen_label_kinetics.py
    ├── gen_label_sthv1.py
    ├── gen_label_sthv2.py
    ├── kinetics_label_map.txt
    ├── vid2img_kinetics.py
    └── vid2img_sthv2.py
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright MIT HAN Lab
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PAN: Persistent Appearance Network
2 |
3 | [](https://paperswithcode.com/sota/action-recognition-in-videos-on-something-1?p=pan-towards-fast-action-recognition-via)
4 | [](https://paperswithcode.com/sota/action-recognition-in-videos-on-something?p=pan-towards-fast-action-recognition-via)
5 | [](https://paperswithcode.com/sota/action-recognition-in-videos-on-jester?p=pan-towards-fast-action-recognition-via)
6 |
7 | PyTorch Implementation of paper:
8 |
9 | > **PAN: Towards Fast Action Recognition via Learning Persistence of Appearance**
10 | >
11 | > Can Zhang, Yuexian Zou\*, Guang Chen and Lei Gan.
12 | >
13 | > [[ArXiv](https://arxiv.org/abs/2008.03462)]
14 |
15 | ## Updates
16 |
17 | **[12 Aug 2020]** We have released the codebase and models of PAN.
18 |
19 | ## Main Contribution
20 |
21 | Efficiently modeling dynamic motion information in videos is crucial for the action recognition task. Most state-of-the-art methods rely heavily on dense optical flow as the motion representation. Although combining optical flow with RGB frames as input can achieve excellent recognition performance, optical flow extraction is very time-consuming, which inevitably works against real-time action recognition. In this paper, we shed light on **fast action recognition** by lifting the reliance on optical flow. We design a novel **motion cue** called **Persistence of Appearance (PA)** that focuses on distilling motion information at boundaries. Extensive experiments show that our PA is over 1000x faster (8196fps *vs.* 8fps) than conventional optical flow in terms of motion modeling speed.
22 |
23 |
24 |
25 |
26 |
27 | ## Content
28 |
29 | - [Dependencies](#dependencies)
30 | - [Data Preparation](#data-preparation)
31 | - [Core Codes](#core-codes)
32 | - [PA Module](#pa-module)
33 | - [VAP Module](#vap-module)
34 | - [Pretrained Models](#pretrained-models)
35 | + [Something-Something-V1](#something-something-v1)
36 | + [Something-Something-V2](#something-something-v2)
37 | - [Testing](#testing)
38 | - [Training](#training)
39 | - [Other Info](#other-info)
40 | - [References](#references)
41 | - [Citation](#citation)
42 | - [Contact](#contact)
43 |
44 | ## Dependencies
45 |
46 | Please make sure the following libraries are installed successfully:
47 |
48 | - [PyTorch](https://pytorch.org/) >= 1.0
49 | - [TensorboardX](https://github.com/lanpa/tensorboardX)
50 | - [tqdm](https://github.com/tqdm/tqdm.git)
51 | - [scikit-learn](https://scikit-learn.org/stable/)
52 |
53 | ## Data Preparation
54 |
55 | Following common practice, we first need to extract frames from the videos for fast reading. Please refer to the [TSN](https://github.com/yjxiong/temporal-segment-networks) repo for a detailed guide on data pre-processing. We have successfully trained on the [Kinetics](https://deepmind.com/research/open-source/open-source-datasets/kinetics/), [UCF101](http://crcv.ucf.edu/data/UCF101.php), [HMDB51](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/), [Something-Something-V1](https://20bn.com/datasets/something-something/v1) and [V2](https://20bn.com/datasets/something-something/v2), and [Jester](https://20bn.com/datasets/jester) datasets with this codebase. Basically, the processing of video data can be summarized into 3 steps:
56 |
57 | 1. Extract frames from videos:
58 |
59 | * For Something-Something-V2 dataset, please use [tools/vid2img_sthv2.py](tools/vid2img_sthv2.py)
60 |
61 | * For Kinetics dataset, please use [tools/vid2img_kinetics.py](tools/vid2img_kinetics.py)
62 |
63 | 2. Generate file lists needed for dataloader:
64 |
65 | * Each line of the list file contains a tuple of (*extracted video frame folder name, number of video frames, video groundtruth class*); a minimal sketch that generates such a list is shown right after these steps. A list file looks like this:
66 |
67 | ```
68 | video_frame_folder 100 10
69 | video_2_frame_folder 150 31
70 | ...
71 | ```
72 |
73 | * Or you can use off-the-shelf tools provided by other repos:
74 | * For Something-Something-V1 & V2 datasets, please use [tools/gen_label_sthv1.py](tools/gen_label_sthv1.py) & [tools/gen_label_sthv2.py](tools/gen_label_sthv2.py)
75 | * For Kinetics dataset, please use [tools/gen_label_kinetics.py](tools/gen_label_kinetics.py)
76 |
77 | 3. Add the dataset information to [ops/dataset_config.py](ops/dataset_config.py)
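
For step 2, the sketch below writes a list file in the format shown above. It assumes one sub-folder of extracted frames per video under `frames_root` and a `labels` dict mapping each folder name to its groundtruth class index (both are assumptions for illustration; the off-the-shelf tools above are the recommended route):

```python
import os

def write_file_list(frames_root, labels, out_path):
    # labels: {frame_folder_name: groundtruth_class_index}, provided by you
    with open(out_path, 'w') as f:
        for folder, cls in labels.items():
            n_frames = len(os.listdir(os.path.join(frames_root, folder)))
            # one line per video: "folder_name num_frames class_index"
            f.write('{} {} {}\n'.format(folder, n_frames, cls))
```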
78 |
79 | ## Core Codes
80 |
81 | ### PA Module
82 |
83 |
84 |
85 |
86 |
87 | The PA module aims to speed up the motion modeling procedure; it can simply be injected at the bottom of the network to lift the reliance on optical flow.
88 |
89 | ```python
90 | import torch
91 | from ops.PAN_modules import PA
92 | PA_module = PA(n_length=4) # adjacent '4' frames are sampled for computing PA
93 | # shape of x: [N*T*m, 3, H, W]
94 | x = torch.randn(5*8*4, 3, 224, 224)
95 | # shape of PA_out: [N*T, m-1, H, W]
96 | PA_out = PA_module(x) # torch.Size([40, 3, 224, 224])
97 | ```
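
For intuition, the toy sketch below computes pixel-wise L2 distances between shallow per-frame features of adjacent frames and reproduces the shapes of the snippet above. It is **not** the implementation in [ops/PAN_modules.py](ops/PAN_modules.py); the shared shallow conv and the channel-wise L2 distance are assumptions made purely for illustration.

```python
import torch
import torch.nn as nn

class ToyPA(nn.Module):
    """Toy stand-in for PA: feature differences between adjacent frames."""
    def __init__(self, n_length=4, n_channels=8):
        super().__init__()
        self.n_length = n_length
        # shallow conv shared across frames, lifting RGB into a small feature space
        self.conv = nn.Conv2d(3, n_channels, kernel_size=3, padding=1)

    def forward(self, x):
        # x: [N*T*m, 3, H, W] with m = n_length adjacent frames per segment
        feat = self.conv(x)                           # [N*T*m, C, H, W]
        _, c, h, w = feat.shape
        feat = feat.view(-1, self.n_length, c, h, w)  # [N*T, m, C, H, W]
        diff = feat[:, 1:] - feat[:, :-1]             # adjacent-frame differences
        return diff.norm(p=2, dim=2)                  # L2 over channels -> [N*T, m-1, H, W]

toy_pa = ToyPA(n_length=4)
print(toy_pa(torch.randn(5 * 8 * 4, 3, 224, 224)).shape)  # torch.Size([40, 3, 224, 224])
```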
98 |
99 | ### VAP Module
100 |
101 | The VAP module aims to adaptively emphasize expressive features and suppress less informative ones by observing global information across various timescales. It is placed at the top of the network to achieve long-term temporal modeling.
102 |
103 |
104 |
105 |
106 |
107 | ```python
108 | import torch
109 | from ops.PAN_modules import VAP
110 | VAP_module = VAP(n_segment=8, feature_dim=2048, num_class=174, dropout_ratio=0.5)
111 | # shape of x: [N*T, D]
112 | x = torch.randn(5*8, 2048)
113 | # shape of VAP_out: [N, num_class]
114 | VAP_out = VAP_module(x) # torch.Size([5, 174])
115 | ```
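
To see how the two modules line up end-to-end, here is a shape walk-through under the same settings as the snippets above (N=5 videos, T=8 segments, m=4 adjacent frames, 2048-d features, 174 classes); the 2D backbone that normally sits between PA and VAP is replaced by random features for brevity.

```python
import torch
from ops.PAN_modules import PA, VAP

N, T, m, feat_dim, num_class = 5, 8, 4, 2048, 174
pa = PA(n_length=m)
vap = VAP(n_segment=T, feature_dim=feat_dim, num_class=num_class, dropout_ratio=0.5)

frames = torch.randn(N * T * m, 3, 224, 224)  # m adjacent frames for each of the N*T segments
pa_maps = pa(frames)                          # [N*T, m-1, 224, 224], fed into the 2D backbone
features = torch.randn(N * T, feat_dim)       # stand-in for the backbone's per-segment features
scores = vap(features)                        # [N, num_class] video-level predictions
```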
116 |
117 | ## Pretrained Models
118 |
119 | Here, we provide pretrained PAN models on the Something-Something-V1 & V2 datasets. Recognizing actions in these datasets requires strong temporal modeling ability, as many action classes are symmetrical (e.g., differing only in the direction of motion). PAN achieves state-of-the-art performance on these datasets. Notably, our method even surpasses optical flow based methods while using only RGB frames as input.
120 |
121 | ### Something-Something-V1
122 |
123 | | Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
124 | | :---: | :---: | :---: | :---: | :---: | :---: |
125 | | PAN_Lite | ResNet-50 | 35.7G * 1 | 48.0 | 76.1 | [Google Drive] or [Weiyun] |
126 | | PAN_Full | ResNet-50 | 67.7G * 1 | 50.5 | 79.2 | [Google Drive] or [Weiyun] |
127 | | PAN_En | ResNet-50 | (46.6G+88.4G) * 2 | 53.4 | 81.1 | [Google Drive] or [Weiyun] |
128 | | PAN_En | ResNet-101 | (85.6G+166.1G) * 2 | 55.3 | 82.8 | [Google Drive] or [Weiyun] |
167 |
168 | ### Something-Something-V2
169 |
170 | | Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
171 | | :---: | :---: | :---: | :---: | :---: | :---: |
172 | | PAN_Lite | ResNet-50 | 35.7G * 1 | 60.8 | 86.7 | [Google Drive] or [Weiyun] |
173 | | PAN_Full | ResNet-50 | 67.7G * 1 | 63.8 | 88.6 | [Google Drive] or [Weiyun] |
174 | | PAN_En | ResNet-50 | (46.6G+88.4G) * 2 | 66.2 | 90.1 | [Google Drive] or [Weiyun] |
175 | | PAN_En | ResNet-101 | (85.6G+166.1G) * 2 | 66.5 | 90.6 | [Google Drive] or [Weiyun] |
214 |
215 | ## Testing
216 |
217 | For example, to test the PAN models on Something-Something-V1, you can first put the downloaded `.pth.tar` files into the "pretrained" folder and then run:
218 |
219 | ```bash
220 | # test PAN_Lite
221 | bash scripts/test/sthv1/Lite.sh
222 |
223 | # test PAN_Full
224 | bash scripts/test/sthv1/Full.sh
225 |
226 | # test PAN_En
227 | bash scripts/test/sthv1/En.sh
228 | ```
229 |
230 | ## Training
231 |
232 | We provide several scripts to train PAN with this repo; please refer to the "[scripts](scripts/)" folder for more details. For example, to train PAN on Something-Something-V1, you can run:
233 |
234 | ```bash
235 | # train PAN_Lite
236 | bash scripts/train/sthv1/Lite.sh
237 |
238 | # train PAN_Full RGB branch
239 | bash scripts/train/sthv1/Full_RGB.sh
240 |
241 | # train PAN_Full PA branch
242 | bash scripts/train/sthv1/Full_PA.sh
243 | ```
244 |
245 | Notice that you should scale up the learning rate with the batch size; for example, if you use a batch size of 256, you should set the learning rate to 0.04.
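
A tiny helper that encodes this rule; the base values (learning rate 0.01 at batch size 64) are inferred from the 256 → 0.04 example and should be checked against the provided training scripts:

```python
def scaled_lr(batch_size, base_lr=0.01, base_batch_size=64):
    # linear scaling rule inferred from the example above (assumption)
    return base_lr * batch_size / base_batch_size

print(scaled_lr(256))  # 0.04
```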
246 |
247 | ## Other Info
248 |
249 | ### References
250 |
251 | This repository is built upon the following baseline implementations for the action recognition task.
252 |
253 | - [TSM](https://github.com/mit-han-lab/temporal-shift-module)
254 | - [TSN](https://github.com/yjxiong/tsn-pytorch)
255 |
256 | ### Citation
257 |
258 | Please **[★star]** this repo and **[cite]** the following arXiv paper if you find our PAN useful for your research:
259 |
260 | ```
261 | @misc{zhang2020pan,
262 | title={PAN: Towards Fast Action Recognition via Learning Persistence of Appearance},
263 | author={Can Zhang and Yuexian Zou and Guang Chen and Lei Gan},
264 | year={2020},
265 | eprint={2008.03462},
266 | archivePrefix={arXiv},
267 | primaryClass={cs.CV}
268 | }
269 | ```
270 |
271 | Or, if you prefer a formal publication, you can cite our preliminary work at ACM MM 2019:
272 |
273 | ```
274 | @inproceedings{zhang2019pan,
275 | title={PAN: Persistent Appearance Network with an Efficient Motion Cue for Fast Action Recognition},
276 | author={Zhang, Can and Zou, Yuexian and Chen, Guang and Gan, Lei},
277 | booktitle={Proceedings of the 27th ACM International Conference on Multimedia},
278 | pages={500--509},
279 | year={2019}
280 | }
281 | ```
282 |
283 | ### Contact
284 |
285 | For any questions, please feel free to open an issue or contact:
286 |
287 | ```
288 | Can Zhang: zhang.can.pku@gmail.com
289 | ```
290 |
--------------------------------------------------------------------------------
/archs/__init__.py:
--------------------------------------------------------------------------------
1 | from .bn_inception import *
2 |
--------------------------------------------------------------------------------
/archs/bn_inception.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division, absolute_import
2 | import torch
3 | import torch.nn as nn
4 | import torch.utils.model_zoo as model_zoo
5 | import torch.nn.functional as F
6 |
7 |
8 | __all__ = ['BNInception', 'bninception']
9 |
10 | pretrained_settings = {
11 | 'bninception': {
12 | 'imagenet': {
13 | 'url': 'https://www.dropbox.com/s/3cvod6kzwluijcw/BNInception-9baff57459f5a1744.pth?dl=1',
14 | 'input_space': 'BGR',
15 | 'input_size': 224,
16 | 'input_range': [0, 255],
17 | 'mean': [104, 117, 128],
18 | 'std': [1, 1, 1],
19 | 'num_classes': 1000
20 | },
21 | 'kinetics': {
22 | 'url': 'https://www.dropbox.com/s/gx4u7itoyygix0c/BNInceptionKinetics-47f0695e.pth?dl=1',
23 | 'input_space': 'BGR',
24 | 'input_size': 224,
25 | 'input_range': [0, 255],
26 | 'mean': [104, 117, 128], # [96.29023126, 103.16065604, 110.63666788]
27 | 'std': [1, 1, 1], # [40.02898126, 37.88248729, 38.7568578],
28 | 'num_classes': 400
29 | }
30 | },
31 | }
32 |
33 |
34 | class BNInception(nn.Module):
35 | def __init__(self, num_classes=1000):
36 | super(BNInception, self).__init__()
37 | inplace = True
38 | self._build_features(inplace, num_classes)
39 |
40 | def forward(self, x):
41 | # if self.input_space == 'BGR':
42 | # assert len(x.size()) == 4
43 | # x = x[:, (2, 1, 0)]
44 | x = self.features(x)
45 | x = self.logits(x)
46 | return x
47 |
48 | def features(self, x):
49 | # stage1
50 | pool1_3x3_s2_out = self._temporal_forward_wrap(self._block_1, 0)(x)
51 | # stage2
52 | pool2_3x3_s2_out = self._temporal_forward_wrap(self._block_2, 1)(pool1_3x3_s2_out)
53 |
54 | # stage3
55 | inception_3a_output_out = self._temporal_forward_wrap(self._block_3a, 2)(pool2_3x3_s2_out)
56 | inception_3b_output_out = self._temporal_forward_wrap(self._block_3b, 3)(inception_3a_output_out)
57 | inception_3c_output_out = self._temporal_forward_wrap(self._block_3c, 4)(inception_3b_output_out)
58 |
59 | inception_4a_output_out = self._temporal_forward_wrap(self._block_4a, 5)(inception_3c_output_out)
60 | inception_4b_output_out = self._temporal_forward_wrap(self._block_4b, 6)(inception_4a_output_out)
61 | inception_4c_output_out = self._temporal_forward_wrap(self._block_4c, 7)(inception_4b_output_out)
62 | inception_4d_output_out = self._temporal_forward_wrap(self._block_4d, 8)(inception_4c_output_out)
63 | inception_4e_output_out = self._temporal_forward_wrap(self._block_4e, 9)(inception_4d_output_out)
64 |
65 | inception_5a_output_out = self._temporal_forward_wrap(self._block_5a, 10)(inception_4e_output_out)
66 | inception_5b_output_out = self._temporal_forward_wrap(self._block_5b, 11)(inception_5a_output_out)
67 |
68 | return inception_5b_output_out
69 |
70 | def logits(self, features):
71 | x = self.global_pool(features)
72 | x = x.view(x.size(0), -1)
73 | x = self.fc(x)
74 | return x
75 |
76 | def build_temporal_ops(self, n_segment, is_temporal_shift='0' * 12, shift_div=8):
77 | # must call after loading weights
78 | self.n_segment = n_segment
79 | self.residual = 'res' in is_temporal_shift
80 | if self.residual:
81 | print('=> Using residual shift functions...')
82 | if is_temporal_shift in ['block', 'blockres']:
83 | self.is_temporal_shift = '1' * 12
84 | else:
85 | self.is_temporal_shift = is_temporal_shift
86 | self.is_temporal_shift = '0' + self.is_temporal_shift[1:] # image input does not shift
87 |
88 | assert len(self.is_temporal_shift) == 12
89 |
90 | print('=> Injecting temporal shift with mask {}'.format(self.is_temporal_shift))
91 | self.fold_div = shift_div
92 | print('=> Using fold div: {}'.format(self.fold_div))
93 |
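    # NOTE: build_temporal_ops() above is meant to be called after the pretrained weights are
    # loaded; _temporal_forward_wrap() below then wraps every stage whose mask bit is '1' with
    # TemporalShift, adding the shifted branch residually when the mode string contains 'res'.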
94 | def _temporal_forward_wrap(self, layer_func, index):
95 | if hasattr(self, 'is_temporal_shift') and self.is_temporal_shift[index] == '1': # run temporal shuffling
96 | from ops.temporal_shift import TemporalShift
97 | def wrapped_func(x, is_residual, n_segment, fold_div):
98 | if is_residual:
99 | x_shift = TemporalShift.shift(x, n_segment, fold_div=fold_div)
100 | return F.relu(x + layer_func(x_shift))
101 | else:
102 | x = TemporalShift.shift(x, n_segment, fold_div=fold_div)
103 | return layer_func(x)
104 | from functools import partial
105 | return partial(wrapped_func, is_residual=self.residual, n_segment=self.n_segment,
106 | fold_div=self.fold_div)
107 | else:
108 | return layer_func
109 |
110 | def _block_1(self, x):
111 | conv1_7x7_s2_out = self.conv1_7x7_s2(x)
112 | conv1_7x7_s2_bn_out = self.conv1_7x7_s2_bn(conv1_7x7_s2_out)
113 | conv1_relu_7x7_out = self.conv1_relu_7x7(conv1_7x7_s2_bn_out)
114 | pool1_3x3_s2_out = self.pool1_3x3_s2(conv1_7x7_s2_bn_out)
115 | return pool1_3x3_s2_out
116 |
117 | def _block_2(self, x):
118 | conv2_3x3_reduce_out = self.conv2_3x3_reduce(x)
119 | conv2_3x3_reduce_bn_out = self.conv2_3x3_reduce_bn(conv2_3x3_reduce_out)
120 | conv2_relu_3x3_reduce_out = self.conv2_relu_3x3_reduce(conv2_3x3_reduce_bn_out)
121 | conv2_3x3_out = self.conv2_3x3(conv2_3x3_reduce_bn_out)
122 | conv2_3x3_bn_out = self.conv2_3x3_bn(conv2_3x3_out)
123 | conv2_relu_3x3_out = self.conv2_relu_3x3(conv2_3x3_bn_out)
124 | pool2_3x3_s2_out = self.pool2_3x3_s2(conv2_3x3_bn_out)
125 | return pool2_3x3_s2_out
126 |
127 | def _block_3a(self, pool2_3x3_s2_out):
128 | inception_3a_1x1_out = self.inception_3a_1x1(pool2_3x3_s2_out)
129 | inception_3a_1x1_bn_out = self.inception_3a_1x1_bn(inception_3a_1x1_out)
130 | inception_3a_relu_1x1_out = self.inception_3a_relu_1x1(inception_3a_1x1_bn_out)
131 | inception_3a_3x3_reduce_out = self.inception_3a_3x3_reduce(pool2_3x3_s2_out)
132 | inception_3a_3x3_reduce_bn_out = self.inception_3a_3x3_reduce_bn(inception_3a_3x3_reduce_out)
133 | inception_3a_relu_3x3_reduce_out = self.inception_3a_relu_3x3_reduce(inception_3a_3x3_reduce_bn_out)
134 | inception_3a_3x3_out = self.inception_3a_3x3(inception_3a_3x3_reduce_bn_out)
135 | inception_3a_3x3_bn_out = self.inception_3a_3x3_bn(inception_3a_3x3_out)
136 | inception_3a_relu_3x3_out = self.inception_3a_relu_3x3(inception_3a_3x3_bn_out)
137 | inception_3a_double_3x3_reduce_out = self.inception_3a_double_3x3_reduce(pool2_3x3_s2_out)
138 | inception_3a_double_3x3_reduce_bn_out = self.inception_3a_double_3x3_reduce_bn(
139 | inception_3a_double_3x3_reduce_out)
140 | inception_3a_relu_double_3x3_reduce_out = self.inception_3a_relu_double_3x3_reduce(
141 | inception_3a_double_3x3_reduce_bn_out)
142 | inception_3a_double_3x3_1_out = self.inception_3a_double_3x3_1(inception_3a_double_3x3_reduce_bn_out)
143 | inception_3a_double_3x3_1_bn_out = self.inception_3a_double_3x3_1_bn(inception_3a_double_3x3_1_out)
144 | inception_3a_relu_double_3x3_1_out = self.inception_3a_relu_double_3x3_1(inception_3a_double_3x3_1_bn_out)
145 | inception_3a_double_3x3_2_out = self.inception_3a_double_3x3_2(inception_3a_double_3x3_1_bn_out)
146 | inception_3a_double_3x3_2_bn_out = self.inception_3a_double_3x3_2_bn(inception_3a_double_3x3_2_out)
147 | inception_3a_relu_double_3x3_2_out = self.inception_3a_relu_double_3x3_2(inception_3a_double_3x3_2_bn_out)
148 | inception_3a_pool_out = self.inception_3a_pool(pool2_3x3_s2_out)
149 | inception_3a_pool_proj_out = self.inception_3a_pool_proj(inception_3a_pool_out)
150 | inception_3a_pool_proj_bn_out = self.inception_3a_pool_proj_bn(inception_3a_pool_proj_out)
151 | inception_3a_relu_pool_proj_out = self.inception_3a_relu_pool_proj(inception_3a_pool_proj_bn_out)
152 | inception_3a_output_out = torch.cat(
153 | [inception_3a_1x1_bn_out, inception_3a_3x3_bn_out, inception_3a_double_3x3_2_bn_out,
154 | inception_3a_pool_proj_bn_out], 1)
155 | return inception_3a_output_out
156 |
157 | def _block_3b(self, inception_3a_output_out):
158 | inception_3b_1x1_out = self.inception_3b_1x1(inception_3a_output_out)
159 | inception_3b_1x1_bn_out = self.inception_3b_1x1_bn(inception_3b_1x1_out)
160 | inception_3b_relu_1x1_out = self.inception_3b_relu_1x1(inception_3b_1x1_bn_out)
161 | inception_3b_3x3_reduce_out = self.inception_3b_3x3_reduce(inception_3a_output_out)
162 | inception_3b_3x3_reduce_bn_out = self.inception_3b_3x3_reduce_bn(inception_3b_3x3_reduce_out)
163 | inception_3b_relu_3x3_reduce_out = self.inception_3b_relu_3x3_reduce(inception_3b_3x3_reduce_bn_out)
164 | inception_3b_3x3_out = self.inception_3b_3x3(inception_3b_3x3_reduce_bn_out)
165 | inception_3b_3x3_bn_out = self.inception_3b_3x3_bn(inception_3b_3x3_out)
166 | inception_3b_relu_3x3_out = self.inception_3b_relu_3x3(inception_3b_3x3_bn_out)
167 | inception_3b_double_3x3_reduce_out = self.inception_3b_double_3x3_reduce(inception_3a_output_out)
168 | inception_3b_double_3x3_reduce_bn_out = self.inception_3b_double_3x3_reduce_bn(
169 | inception_3b_double_3x3_reduce_out)
170 | inception_3b_relu_double_3x3_reduce_out = self.inception_3b_relu_double_3x3_reduce(
171 | inception_3b_double_3x3_reduce_bn_out)
172 | inception_3b_double_3x3_1_out = self.inception_3b_double_3x3_1(inception_3b_double_3x3_reduce_bn_out)
173 | inception_3b_double_3x3_1_bn_out = self.inception_3b_double_3x3_1_bn(inception_3b_double_3x3_1_out)
174 | inception_3b_relu_double_3x3_1_out = self.inception_3b_relu_double_3x3_1(inception_3b_double_3x3_1_bn_out)
175 | inception_3b_double_3x3_2_out = self.inception_3b_double_3x3_2(inception_3b_double_3x3_1_bn_out)
176 | inception_3b_double_3x3_2_bn_out = self.inception_3b_double_3x3_2_bn(inception_3b_double_3x3_2_out)
177 | inception_3b_relu_double_3x3_2_out = self.inception_3b_relu_double_3x3_2(inception_3b_double_3x3_2_bn_out)
178 | inception_3b_pool_out = self.inception_3b_pool(inception_3a_output_out)
179 | inception_3b_pool_proj_out = self.inception_3b_pool_proj(inception_3b_pool_out)
180 | inception_3b_pool_proj_bn_out = self.inception_3b_pool_proj_bn(inception_3b_pool_proj_out)
181 | inception_3b_relu_pool_proj_out = self.inception_3b_relu_pool_proj(inception_3b_pool_proj_bn_out)
182 | inception_3b_output_out = torch.cat(
183 | [inception_3b_1x1_bn_out, inception_3b_3x3_bn_out, inception_3b_double_3x3_2_bn_out,
184 | inception_3b_pool_proj_bn_out], 1)
185 | return inception_3b_output_out
186 |
187 | def _block_3c(self, inception_3b_output_out):
188 | inception_3c_3x3_reduce_out = self.inception_3c_3x3_reduce(inception_3b_output_out)
189 | inception_3c_3x3_reduce_bn_out = self.inception_3c_3x3_reduce_bn(inception_3c_3x3_reduce_out)
190 | inception_3c_relu_3x3_reduce_out = self.inception_3c_relu_3x3_reduce(inception_3c_3x3_reduce_bn_out)
191 | inception_3c_3x3_out = self.inception_3c_3x3(inception_3c_3x3_reduce_bn_out)
192 | inception_3c_3x3_bn_out = self.inception_3c_3x3_bn(inception_3c_3x3_out)
193 | inception_3c_relu_3x3_out = self.inception_3c_relu_3x3(inception_3c_3x3_bn_out)
194 | inception_3c_double_3x3_reduce_out = self.inception_3c_double_3x3_reduce(inception_3b_output_out)
195 | inception_3c_double_3x3_reduce_bn_out = self.inception_3c_double_3x3_reduce_bn(
196 | inception_3c_double_3x3_reduce_out)
197 | inception_3c_relu_double_3x3_reduce_out = self.inception_3c_relu_double_3x3_reduce(
198 | inception_3c_double_3x3_reduce_bn_out)
199 | inception_3c_double_3x3_1_out = self.inception_3c_double_3x3_1(inception_3c_double_3x3_reduce_bn_out)
200 | inception_3c_double_3x3_1_bn_out = self.inception_3c_double_3x3_1_bn(inception_3c_double_3x3_1_out)
201 | inception_3c_relu_double_3x3_1_out = self.inception_3c_relu_double_3x3_1(inception_3c_double_3x3_1_bn_out)
202 | inception_3c_double_3x3_2_out = self.inception_3c_double_3x3_2(inception_3c_double_3x3_1_bn_out)
203 | inception_3c_double_3x3_2_bn_out = self.inception_3c_double_3x3_2_bn(inception_3c_double_3x3_2_out)
204 | inception_3c_relu_double_3x3_2_out = self.inception_3c_relu_double_3x3_2(inception_3c_double_3x3_2_bn_out)
205 | inception_3c_pool_out = self.inception_3c_pool(inception_3b_output_out)
206 | inception_3c_output_out = torch.cat(
207 | [inception_3c_3x3_bn_out, inception_3c_double_3x3_2_bn_out, inception_3c_pool_out], 1)
208 | return inception_3c_output_out
209 |
210 | def _block_4a(self, inception_3c_output_out):
211 | inception_4a_1x1_out = self.inception_4a_1x1(inception_3c_output_out)
212 | inception_4a_1x1_bn_out = self.inception_4a_1x1_bn(inception_4a_1x1_out)
213 | inception_4a_relu_1x1_out = self.inception_4a_relu_1x1(inception_4a_1x1_bn_out)
214 | inception_4a_3x3_reduce_out = self.inception_4a_3x3_reduce(inception_3c_output_out)
215 | inception_4a_3x3_reduce_bn_out = self.inception_4a_3x3_reduce_bn(inception_4a_3x3_reduce_out)
216 | inception_4a_relu_3x3_reduce_out = self.inception_4a_relu_3x3_reduce(inception_4a_3x3_reduce_bn_out)
217 | inception_4a_3x3_out = self.inception_4a_3x3(inception_4a_3x3_reduce_bn_out)
218 | inception_4a_3x3_bn_out = self.inception_4a_3x3_bn(inception_4a_3x3_out)
219 | inception_4a_relu_3x3_out = self.inception_4a_relu_3x3(inception_4a_3x3_bn_out)
220 | inception_4a_double_3x3_reduce_out = self.inception_4a_double_3x3_reduce(inception_3c_output_out)
221 | inception_4a_double_3x3_reduce_bn_out = self.inception_4a_double_3x3_reduce_bn(
222 | inception_4a_double_3x3_reduce_out)
223 | inception_4a_relu_double_3x3_reduce_out = self.inception_4a_relu_double_3x3_reduce(
224 | inception_4a_double_3x3_reduce_bn_out)
225 | inception_4a_double_3x3_1_out = self.inception_4a_double_3x3_1(inception_4a_double_3x3_reduce_bn_out)
226 | inception_4a_double_3x3_1_bn_out = self.inception_4a_double_3x3_1_bn(inception_4a_double_3x3_1_out)
227 | inception_4a_relu_double_3x3_1_out = self.inception_4a_relu_double_3x3_1(inception_4a_double_3x3_1_bn_out)
228 | inception_4a_double_3x3_2_out = self.inception_4a_double_3x3_2(inception_4a_double_3x3_1_bn_out)
229 | inception_4a_double_3x3_2_bn_out = self.inception_4a_double_3x3_2_bn(inception_4a_double_3x3_2_out)
230 | inception_4a_relu_double_3x3_2_out = self.inception_4a_relu_double_3x3_2(inception_4a_double_3x3_2_bn_out)
231 | inception_4a_pool_out = self.inception_4a_pool(inception_3c_output_out)
232 | inception_4a_pool_proj_out = self.inception_4a_pool_proj(inception_4a_pool_out)
233 | inception_4a_pool_proj_bn_out = self.inception_4a_pool_proj_bn(inception_4a_pool_proj_out)
234 | inception_4a_relu_pool_proj_out = self.inception_4a_relu_pool_proj(inception_4a_pool_proj_bn_out)
235 | inception_4a_output_out = torch.cat(
236 | [inception_4a_1x1_bn_out, inception_4a_3x3_bn_out, inception_4a_double_3x3_2_bn_out,
237 | inception_4a_pool_proj_bn_out], 1)
238 | return inception_4a_output_out
239 |
240 | def _block_4b(self, inception_4a_output_out):
241 | inception_4b_1x1_out = self.inception_4b_1x1(inception_4a_output_out)
242 | inception_4b_1x1_bn_out = self.inception_4b_1x1_bn(inception_4b_1x1_out)
243 | inception_4b_relu_1x1_out = self.inception_4b_relu_1x1(inception_4b_1x1_bn_out)
244 | inception_4b_3x3_reduce_out = self.inception_4b_3x3_reduce(inception_4a_output_out)
245 | inception_4b_3x3_reduce_bn_out = self.inception_4b_3x3_reduce_bn(inception_4b_3x3_reduce_out)
246 | inception_4b_relu_3x3_reduce_out = self.inception_4b_relu_3x3_reduce(inception_4b_3x3_reduce_bn_out)
247 | inception_4b_3x3_out = self.inception_4b_3x3(inception_4b_3x3_reduce_bn_out)
248 | inception_4b_3x3_bn_out = self.inception_4b_3x3_bn(inception_4b_3x3_out)
249 | inception_4b_relu_3x3_out = self.inception_4b_relu_3x3(inception_4b_3x3_bn_out)
250 | inception_4b_double_3x3_reduce_out = self.inception_4b_double_3x3_reduce(inception_4a_output_out)
251 | inception_4b_double_3x3_reduce_bn_out = self.inception_4b_double_3x3_reduce_bn(
252 | inception_4b_double_3x3_reduce_out)
253 | inception_4b_relu_double_3x3_reduce_out = self.inception_4b_relu_double_3x3_reduce(
254 | inception_4b_double_3x3_reduce_bn_out)
255 | inception_4b_double_3x3_1_out = self.inception_4b_double_3x3_1(inception_4b_double_3x3_reduce_bn_out)
256 | inception_4b_double_3x3_1_bn_out = self.inception_4b_double_3x3_1_bn(inception_4b_double_3x3_1_out)
257 | inception_4b_relu_double_3x3_1_out = self.inception_4b_relu_double_3x3_1(inception_4b_double_3x3_1_bn_out)
258 | inception_4b_double_3x3_2_out = self.inception_4b_double_3x3_2(inception_4b_double_3x3_1_bn_out)
259 | inception_4b_double_3x3_2_bn_out = self.inception_4b_double_3x3_2_bn(inception_4b_double_3x3_2_out)
260 | inception_4b_relu_double_3x3_2_out = self.inception_4b_relu_double_3x3_2(inception_4b_double_3x3_2_bn_out)
261 | inception_4b_pool_out = self.inception_4b_pool(inception_4a_output_out)
262 | inception_4b_pool_proj_out = self.inception_4b_pool_proj(inception_4b_pool_out)
263 | inception_4b_pool_proj_bn_out = self.inception_4b_pool_proj_bn(inception_4b_pool_proj_out)
264 | inception_4b_relu_pool_proj_out = self.inception_4b_relu_pool_proj(inception_4b_pool_proj_bn_out)
265 | inception_4b_output_out = torch.cat(
266 | [inception_4b_1x1_bn_out, inception_4b_3x3_bn_out, inception_4b_double_3x3_2_bn_out,
267 | inception_4b_pool_proj_bn_out], 1)
268 | return inception_4b_output_out
269 |
270 | def _block_4c(self, inception_4b_output_out):
271 | inception_4c_1x1_out = self.inception_4c_1x1(inception_4b_output_out)
272 | inception_4c_1x1_bn_out = self.inception_4c_1x1_bn(inception_4c_1x1_out)
273 | inception_4c_relu_1x1_out = self.inception_4c_relu_1x1(inception_4c_1x1_bn_out)
274 | inception_4c_3x3_reduce_out = self.inception_4c_3x3_reduce(inception_4b_output_out)
275 | inception_4c_3x3_reduce_bn_out = self.inception_4c_3x3_reduce_bn(inception_4c_3x3_reduce_out)
276 | inception_4c_relu_3x3_reduce_out = self.inception_4c_relu_3x3_reduce(inception_4c_3x3_reduce_bn_out)
277 | inception_4c_3x3_out = self.inception_4c_3x3(inception_4c_3x3_reduce_bn_out)
278 | inception_4c_3x3_bn_out = self.inception_4c_3x3_bn(inception_4c_3x3_out)
279 | inception_4c_relu_3x3_out = self.inception_4c_relu_3x3(inception_4c_3x3_bn_out)
280 | inception_4c_double_3x3_reduce_out = self.inception_4c_double_3x3_reduce(inception_4b_output_out)
281 | inception_4c_double_3x3_reduce_bn_out = self.inception_4c_double_3x3_reduce_bn(
282 | inception_4c_double_3x3_reduce_out)
283 | inception_4c_relu_double_3x3_reduce_out = self.inception_4c_relu_double_3x3_reduce(
284 | inception_4c_double_3x3_reduce_bn_out)
285 | inception_4c_double_3x3_1_out = self.inception_4c_double_3x3_1(inception_4c_double_3x3_reduce_bn_out)
286 | inception_4c_double_3x3_1_bn_out = self.inception_4c_double_3x3_1_bn(inception_4c_double_3x3_1_out)
287 | inception_4c_relu_double_3x3_1_out = self.inception_4c_relu_double_3x3_1(inception_4c_double_3x3_1_bn_out)
288 | inception_4c_double_3x3_2_out = self.inception_4c_double_3x3_2(inception_4c_double_3x3_1_bn_out)
289 | inception_4c_double_3x3_2_bn_out = self.inception_4c_double_3x3_2_bn(inception_4c_double_3x3_2_out)
290 | inception_4c_relu_double_3x3_2_out = self.inception_4c_relu_double_3x3_2(inception_4c_double_3x3_2_bn_out)
291 | inception_4c_pool_out = self.inception_4c_pool(inception_4b_output_out)
292 | inception_4c_pool_proj_out = self.inception_4c_pool_proj(inception_4c_pool_out)
293 | inception_4c_pool_proj_bn_out = self.inception_4c_pool_proj_bn(inception_4c_pool_proj_out)
294 | inception_4c_relu_pool_proj_out = self.inception_4c_relu_pool_proj(inception_4c_pool_proj_bn_out)
295 | inception_4c_output_out = torch.cat(
296 | [inception_4c_1x1_bn_out, inception_4c_3x3_bn_out, inception_4c_double_3x3_2_bn_out,
297 | inception_4c_pool_proj_bn_out], 1)
298 | return inception_4c_output_out
299 |
300 | def _block_4d(self, inception_4c_output_out):
301 | inception_4d_1x1_out = self.inception_4d_1x1(inception_4c_output_out)
302 | inception_4d_1x1_bn_out = self.inception_4d_1x1_bn(inception_4d_1x1_out)
303 | inception_4d_relu_1x1_out = self.inception_4d_relu_1x1(inception_4d_1x1_bn_out)
304 | inception_4d_3x3_reduce_out = self.inception_4d_3x3_reduce(inception_4c_output_out)
305 | inception_4d_3x3_reduce_bn_out = self.inception_4d_3x3_reduce_bn(inception_4d_3x3_reduce_out)
306 | inception_4d_relu_3x3_reduce_out = self.inception_4d_relu_3x3_reduce(inception_4d_3x3_reduce_bn_out)
307 | inception_4d_3x3_out = self.inception_4d_3x3(inception_4d_3x3_reduce_bn_out)
308 | inception_4d_3x3_bn_out = self.inception_4d_3x3_bn(inception_4d_3x3_out)
309 | inception_4d_relu_3x3_out = self.inception_4d_relu_3x3(inception_4d_3x3_bn_out)
310 | inception_4d_double_3x3_reduce_out = self.inception_4d_double_3x3_reduce(inception_4c_output_out)
311 | inception_4d_double_3x3_reduce_bn_out = self.inception_4d_double_3x3_reduce_bn(
312 | inception_4d_double_3x3_reduce_out)
313 | inception_4d_relu_double_3x3_reduce_out = self.inception_4d_relu_double_3x3_reduce(
314 | inception_4d_double_3x3_reduce_bn_out)
315 | inception_4d_double_3x3_1_out = self.inception_4d_double_3x3_1(inception_4d_double_3x3_reduce_bn_out)
316 | inception_4d_double_3x3_1_bn_out = self.inception_4d_double_3x3_1_bn(inception_4d_double_3x3_1_out)
317 | inception_4d_relu_double_3x3_1_out = self.inception_4d_relu_double_3x3_1(inception_4d_double_3x3_1_bn_out)
318 | inception_4d_double_3x3_2_out = self.inception_4d_double_3x3_2(inception_4d_double_3x3_1_bn_out)
319 | inception_4d_double_3x3_2_bn_out = self.inception_4d_double_3x3_2_bn(inception_4d_double_3x3_2_out)
320 | inception_4d_relu_double_3x3_2_out = self.inception_4d_relu_double_3x3_2(inception_4d_double_3x3_2_bn_out)
321 | inception_4d_pool_out = self.inception_4d_pool(inception_4c_output_out)
322 | inception_4d_pool_proj_out = self.inception_4d_pool_proj(inception_4d_pool_out)
323 | inception_4d_pool_proj_bn_out = self.inception_4d_pool_proj_bn(inception_4d_pool_proj_out)
324 | inception_4d_relu_pool_proj_out = self.inception_4d_relu_pool_proj(inception_4d_pool_proj_bn_out)
325 | inception_4d_output_out = torch.cat(
326 | [inception_4d_1x1_bn_out, inception_4d_3x3_bn_out, inception_4d_double_3x3_2_bn_out,
327 | inception_4d_pool_proj_bn_out], 1)
328 | return inception_4d_output_out
329 |
330 | def _block_4e(self, inception_4d_output_out):
331 | inception_4e_3x3_reduce_out = self.inception_4e_3x3_reduce(inception_4d_output_out)
332 | inception_4e_3x3_reduce_bn_out = self.inception_4e_3x3_reduce_bn(inception_4e_3x3_reduce_out)
333 | inception_4e_relu_3x3_reduce_out = self.inception_4e_relu_3x3_reduce(inception_4e_3x3_reduce_bn_out)
334 | inception_4e_3x3_out = self.inception_4e_3x3(inception_4e_3x3_reduce_bn_out)
335 | inception_4e_3x3_bn_out = self.inception_4e_3x3_bn(inception_4e_3x3_out)
336 | inception_4e_relu_3x3_out = self.inception_4e_relu_3x3(inception_4e_3x3_bn_out)
337 | inception_4e_double_3x3_reduce_out = self.inception_4e_double_3x3_reduce(inception_4d_output_out)
338 | inception_4e_double_3x3_reduce_bn_out = self.inception_4e_double_3x3_reduce_bn(
339 | inception_4e_double_3x3_reduce_out)
340 | inception_4e_relu_double_3x3_reduce_out = self.inception_4e_relu_double_3x3_reduce(
341 | inception_4e_double_3x3_reduce_bn_out)
342 | inception_4e_double_3x3_1_out = self.inception_4e_double_3x3_1(inception_4e_double_3x3_reduce_bn_out)
343 | inception_4e_double_3x3_1_bn_out = self.inception_4e_double_3x3_1_bn(inception_4e_double_3x3_1_out)
344 | inception_4e_relu_double_3x3_1_out = self.inception_4e_relu_double_3x3_1(inception_4e_double_3x3_1_bn_out)
345 | inception_4e_double_3x3_2_out = self.inception_4e_double_3x3_2(inception_4e_double_3x3_1_bn_out)
346 | inception_4e_double_3x3_2_bn_out = self.inception_4e_double_3x3_2_bn(inception_4e_double_3x3_2_out)
347 | inception_4e_relu_double_3x3_2_out = self.inception_4e_relu_double_3x3_2(inception_4e_double_3x3_2_bn_out)
348 | inception_4e_pool_out = self.inception_4e_pool(inception_4d_output_out)
349 | inception_4e_output_out = torch.cat(
350 | [inception_4e_3x3_bn_out, inception_4e_double_3x3_2_bn_out, inception_4e_pool_out], 1)
351 | return inception_4e_output_out
352 |
353 | def _block_5a(self, inception_4e_output_out):
354 | inception_5a_1x1_out = self.inception_5a_1x1(inception_4e_output_out)
355 | inception_5a_1x1_bn_out = self.inception_5a_1x1_bn(inception_5a_1x1_out)
356 | inception_5a_relu_1x1_out = self.inception_5a_relu_1x1(inception_5a_1x1_bn_out)
357 | inception_5a_3x3_reduce_out = self.inception_5a_3x3_reduce(inception_4e_output_out)
358 | inception_5a_3x3_reduce_bn_out = self.inception_5a_3x3_reduce_bn(inception_5a_3x3_reduce_out)
359 | inception_5a_relu_3x3_reduce_out = self.inception_5a_relu_3x3_reduce(inception_5a_3x3_reduce_bn_out)
360 | inception_5a_3x3_out = self.inception_5a_3x3(inception_5a_3x3_reduce_bn_out)
361 | inception_5a_3x3_bn_out = self.inception_5a_3x3_bn(inception_5a_3x3_out)
362 | inception_5a_relu_3x3_out = self.inception_5a_relu_3x3(inception_5a_3x3_bn_out)
363 | inception_5a_double_3x3_reduce_out = self.inception_5a_double_3x3_reduce(inception_4e_output_out)
364 | inception_5a_double_3x3_reduce_bn_out = self.inception_5a_double_3x3_reduce_bn(
365 | inception_5a_double_3x3_reduce_out)
366 | inception_5a_relu_double_3x3_reduce_out = self.inception_5a_relu_double_3x3_reduce(
367 | inception_5a_double_3x3_reduce_bn_out)
368 | inception_5a_double_3x3_1_out = self.inception_5a_double_3x3_1(inception_5a_double_3x3_reduce_bn_out)
369 | inception_5a_double_3x3_1_bn_out = self.inception_5a_double_3x3_1_bn(inception_5a_double_3x3_1_out)
370 | inception_5a_relu_double_3x3_1_out = self.inception_5a_relu_double_3x3_1(inception_5a_double_3x3_1_bn_out)
371 | inception_5a_double_3x3_2_out = self.inception_5a_double_3x3_2(inception_5a_double_3x3_1_bn_out)
372 | inception_5a_double_3x3_2_bn_out = self.inception_5a_double_3x3_2_bn(inception_5a_double_3x3_2_out)
373 | inception_5a_relu_double_3x3_2_out = self.inception_5a_relu_double_3x3_2(inception_5a_double_3x3_2_bn_out)
374 | inception_5a_pool_out = self.inception_5a_pool(inception_4e_output_out)
375 | inception_5a_pool_proj_out = self.inception_5a_pool_proj(inception_5a_pool_out)
376 | inception_5a_pool_proj_bn_out = self.inception_5a_pool_proj_bn(inception_5a_pool_proj_out)
377 | inception_5a_relu_pool_proj_out = self.inception_5a_relu_pool_proj(inception_5a_pool_proj_bn_out)
378 | inception_5a_output_out = torch.cat(
379 | [inception_5a_1x1_bn_out, inception_5a_3x3_bn_out, inception_5a_double_3x3_2_bn_out,
380 | inception_5a_pool_proj_bn_out], 1)
381 | return inception_5a_output_out
382 |
383 | def _block_5b(self, inception_5a_output_out):
384 | inception_5b_1x1_out = self.inception_5b_1x1(inception_5a_output_out)
385 | inception_5b_1x1_bn_out = self.inception_5b_1x1_bn(inception_5b_1x1_out)
386 | inception_5b_relu_1x1_out = self.inception_5b_relu_1x1(inception_5b_1x1_bn_out)
387 | inception_5b_3x3_reduce_out = self.inception_5b_3x3_reduce(inception_5a_output_out)
388 | inception_5b_3x3_reduce_bn_out = self.inception_5b_3x3_reduce_bn(inception_5b_3x3_reduce_out)
389 | inception_5b_relu_3x3_reduce_out = self.inception_5b_relu_3x3_reduce(inception_5b_3x3_reduce_bn_out)
390 | inception_5b_3x3_out = self.inception_5b_3x3(inception_5b_3x3_reduce_bn_out)
391 | inception_5b_3x3_bn_out = self.inception_5b_3x3_bn(inception_5b_3x3_out)
392 | inception_5b_relu_3x3_out = self.inception_5b_relu_3x3(inception_5b_3x3_bn_out)
393 | inception_5b_double_3x3_reduce_out = self.inception_5b_double_3x3_reduce(inception_5a_output_out)
394 | inception_5b_double_3x3_reduce_bn_out = self.inception_5b_double_3x3_reduce_bn(
395 | inception_5b_double_3x3_reduce_out)
396 | inception_5b_relu_double_3x3_reduce_out = self.inception_5b_relu_double_3x3_reduce(
397 | inception_5b_double_3x3_reduce_bn_out)
398 | inception_5b_double_3x3_1_out = self.inception_5b_double_3x3_1(inception_5b_double_3x3_reduce_bn_out)
399 | inception_5b_double_3x3_1_bn_out = self.inception_5b_double_3x3_1_bn(inception_5b_double_3x3_1_out)
400 | inception_5b_relu_double_3x3_1_out = self.inception_5b_relu_double_3x3_1(inception_5b_double_3x3_1_bn_out)
401 | inception_5b_double_3x3_2_out = self.inception_5b_double_3x3_2(inception_5b_double_3x3_1_bn_out)
402 | inception_5b_double_3x3_2_bn_out = self.inception_5b_double_3x3_2_bn(inception_5b_double_3x3_2_out)
403 | inception_5b_relu_double_3x3_2_out = self.inception_5b_relu_double_3x3_2(inception_5b_double_3x3_2_bn_out)
404 | inception_5b_pool_out = self.inception_5b_pool(inception_5a_output_out)
405 | inception_5b_pool_proj_out = self.inception_5b_pool_proj(inception_5b_pool_out)
406 | inception_5b_pool_proj_bn_out = self.inception_5b_pool_proj_bn(inception_5b_pool_proj_out)
407 | inception_5b_relu_pool_proj_out = self.inception_5b_relu_pool_proj(inception_5b_pool_proj_bn_out)
408 | inception_5b_output_out = torch.cat(
409 | [inception_5b_1x1_bn_out, inception_5b_3x3_bn_out, inception_5b_double_3x3_2_bn_out,
410 | inception_5b_pool_proj_bn_out], 1)
411 | return inception_5b_output_out
412 |
413 | def _build_features(self, inplace, num_classes):
414 | self.conv1_7x7_s2 = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
415 | self.conv1_7x7_s2_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
416 | self.conv1_relu_7x7 = nn.ReLU(inplace)
417 | self.pool1_3x3_s2 = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True)
418 | self.conv2_3x3_reduce = nn.Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
419 | self.conv2_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
420 | self.conv2_relu_3x3_reduce = nn.ReLU(inplace)
421 | self.conv2_3x3 = nn.Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
422 | self.conv2_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
423 | self.conv2_relu_3x3 = nn.ReLU(inplace)
424 | self.pool2_3x3_s2 = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True)
425 | self.inception_3a_1x1 = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
426 | self.inception_3a_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
427 | self.inception_3a_relu_1x1 = nn.ReLU(inplace)
428 | self.inception_3a_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
429 | self.inception_3a_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
430 | self.inception_3a_relu_3x3_reduce = nn.ReLU(inplace)
431 | self.inception_3a_3x3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
432 | self.inception_3a_3x3_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
433 | self.inception_3a_relu_3x3 = nn.ReLU(inplace)
434 | self.inception_3a_double_3x3_reduce = nn.Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
435 | self.inception_3a_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
436 | self.inception_3a_relu_double_3x3_reduce = nn.ReLU(inplace)
437 | self.inception_3a_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
438 | self.inception_3a_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
439 | self.inception_3a_relu_double_3x3_1 = nn.ReLU(inplace)
440 | self.inception_3a_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
441 | self.inception_3a_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
442 | self.inception_3a_relu_double_3x3_2 = nn.ReLU(inplace)
443 | self.inception_3a_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
444 | self.inception_3a_pool_proj = nn.Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1))
445 | self.inception_3a_pool_proj_bn = nn.BatchNorm2d(32, eps=1e-05, momentum=0.9, affine=True)
446 | self.inception_3a_relu_pool_proj = nn.ReLU(inplace)
447 | self.inception_3b_1x1 = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
448 | self.inception_3b_1x1_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
449 | self.inception_3b_relu_1x1 = nn.ReLU(inplace)
450 | self.inception_3b_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
451 | self.inception_3b_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
452 | self.inception_3b_relu_3x3_reduce = nn.ReLU(inplace)
453 | self.inception_3b_3x3 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
454 | self.inception_3b_3x3_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
455 | self.inception_3b_relu_3x3 = nn.ReLU(inplace)
456 | self.inception_3b_double_3x3_reduce = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
457 | self.inception_3b_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
458 | self.inception_3b_relu_double_3x3_reduce = nn.ReLU(inplace)
459 | self.inception_3b_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
460 | self.inception_3b_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
461 | self.inception_3b_relu_double_3x3_1 = nn.ReLU(inplace)
462 | self.inception_3b_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
463 | self.inception_3b_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
464 | self.inception_3b_relu_double_3x3_2 = nn.ReLU(inplace)
465 | self.inception_3b_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
466 | self.inception_3b_pool_proj = nn.Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
467 | self.inception_3b_pool_proj_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
468 | self.inception_3b_relu_pool_proj = nn.ReLU(inplace)
469 | self.inception_3c_3x3_reduce = nn.Conv2d(320, 128, kernel_size=(1, 1), stride=(1, 1))
470 | self.inception_3c_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
471 | self.inception_3c_relu_3x3_reduce = nn.ReLU(inplace)
472 | self.inception_3c_3x3 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
473 | self.inception_3c_3x3_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
474 | self.inception_3c_relu_3x3 = nn.ReLU(inplace)
475 | self.inception_3c_double_3x3_reduce = nn.Conv2d(320, 64, kernel_size=(1, 1), stride=(1, 1))
476 | self.inception_3c_double_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
477 | self.inception_3c_relu_double_3x3_reduce = nn.ReLU(inplace)
478 | self.inception_3c_double_3x3_1 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
479 | self.inception_3c_double_3x3_1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
480 | self.inception_3c_relu_double_3x3_1 = nn.ReLU(inplace)
481 | self.inception_3c_double_3x3_2 = nn.Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
482 | self.inception_3c_double_3x3_2_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
483 | self.inception_3c_relu_double_3x3_2 = nn.ReLU(inplace)
484 | self.inception_3c_pool = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True)
485 | self.inception_4a_1x1 = nn.Conv2d(576, 224, kernel_size=(1, 1), stride=(1, 1))
486 | self.inception_4a_1x1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True)
487 | self.inception_4a_relu_1x1 = nn.ReLU(inplace)
488 | self.inception_4a_3x3_reduce = nn.Conv2d(576, 64, kernel_size=(1, 1), stride=(1, 1))
489 | self.inception_4a_3x3_reduce_bn = nn.BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True)
490 | self.inception_4a_relu_3x3_reduce = nn.ReLU(inplace)
491 | self.inception_4a_3x3 = nn.Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
492 | self.inception_4a_3x3_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
493 | self.inception_4a_relu_3x3 = nn.ReLU(inplace)
494 | self.inception_4a_double_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1))
495 | self.inception_4a_double_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
496 | self.inception_4a_relu_double_3x3_reduce = nn.ReLU(inplace)
497 | self.inception_4a_double_3x3_1 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
498 | self.inception_4a_double_3x3_1_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
499 | self.inception_4a_relu_double_3x3_1 = nn.ReLU(inplace)
500 | self.inception_4a_double_3x3_2 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
501 | self.inception_4a_double_3x3_2_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
502 | self.inception_4a_relu_double_3x3_2 = nn.ReLU(inplace)
503 | self.inception_4a_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
504 | self.inception_4a_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1))
505 | self.inception_4a_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
506 | self.inception_4a_relu_pool_proj = nn.ReLU(inplace)
507 | self.inception_4b_1x1 = nn.Conv2d(576, 192, kernel_size=(1, 1), stride=(1, 1))
508 | self.inception_4b_1x1_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
509 | self.inception_4b_relu_1x1 = nn.ReLU(inplace)
510 | self.inception_4b_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1))
511 | self.inception_4b_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
512 | self.inception_4b_relu_3x3_reduce = nn.ReLU(inplace)
513 | self.inception_4b_3x3 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
514 | self.inception_4b_3x3_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
515 | self.inception_4b_relu_3x3 = nn.ReLU(inplace)
516 | self.inception_4b_double_3x3_reduce = nn.Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1))
517 | self.inception_4b_double_3x3_reduce_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
518 | self.inception_4b_relu_double_3x3_reduce = nn.ReLU(inplace)
519 | self.inception_4b_double_3x3_1 = nn.Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
520 | self.inception_4b_double_3x3_1_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
521 | self.inception_4b_relu_double_3x3_1 = nn.ReLU(inplace)
522 | self.inception_4b_double_3x3_2 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
523 | self.inception_4b_double_3x3_2_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
524 | self.inception_4b_relu_double_3x3_2 = nn.ReLU(inplace)
525 | self.inception_4b_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
526 | self.inception_4b_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1))
527 | self.inception_4b_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
528 | self.inception_4b_relu_pool_proj = nn.ReLU(inplace)
529 | self.inception_4c_1x1 = nn.Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1))
530 | self.inception_4c_1x1_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
531 | self.inception_4c_relu_1x1 = nn.ReLU(inplace)
532 | self.inception_4c_3x3_reduce = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1))
533 | self.inception_4c_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
534 | self.inception_4c_relu_3x3_reduce = nn.ReLU(inplace)
535 | self.inception_4c_3x3 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
536 | self.inception_4c_3x3_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
537 | self.inception_4c_relu_3x3 = nn.ReLU(inplace)
538 | self.inception_4c_double_3x3_reduce = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1))
539 | self.inception_4c_double_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
540 | self.inception_4c_relu_double_3x3_reduce = nn.ReLU(inplace)
541 | self.inception_4c_double_3x3_1 = nn.Conv2d(128, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
542 | self.inception_4c_double_3x3_1_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
543 | self.inception_4c_relu_double_3x3_1 = nn.ReLU(inplace)
544 | self.inception_4c_double_3x3_2 = nn.Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
545 | self.inception_4c_double_3x3_2_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
546 | self.inception_4c_relu_double_3x3_2 = nn.ReLU(inplace)
547 | self.inception_4c_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
548 | self.inception_4c_pool_proj = nn.Conv2d(576, 128, kernel_size=(1, 1), stride=(1, 1))
549 | self.inception_4c_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
550 | self.inception_4c_relu_pool_proj = nn.ReLU(inplace)
551 | self.inception_4d_1x1 = nn.Conv2d(608, 96, kernel_size=(1, 1), stride=(1, 1))
552 | self.inception_4d_1x1_bn = nn.BatchNorm2d(96, eps=1e-05, momentum=0.9, affine=True)
553 | self.inception_4d_relu_1x1 = nn.ReLU(inplace)
554 | self.inception_4d_3x3_reduce = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1))
555 | self.inception_4d_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
556 | self.inception_4d_relu_3x3_reduce = nn.ReLU(inplace)
557 | self.inception_4d_3x3 = nn.Conv2d(128, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
558 | self.inception_4d_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
559 | self.inception_4d_relu_3x3 = nn.ReLU(inplace)
560 | self.inception_4d_double_3x3_reduce = nn.Conv2d(608, 160, kernel_size=(1, 1), stride=(1, 1))
561 | self.inception_4d_double_3x3_reduce_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
562 | self.inception_4d_relu_double_3x3_reduce = nn.ReLU(inplace)
563 | self.inception_4d_double_3x3_1 = nn.Conv2d(160, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
564 | self.inception_4d_double_3x3_1_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
565 | self.inception_4d_relu_double_3x3_1 = nn.ReLU(inplace)
566 | self.inception_4d_double_3x3_2 = nn.Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
567 | self.inception_4d_double_3x3_2_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
568 | self.inception_4d_relu_double_3x3_2 = nn.ReLU(inplace)
569 | self.inception_4d_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
570 | self.inception_4d_pool_proj = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1))
571 | self.inception_4d_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
572 | self.inception_4d_relu_pool_proj = nn.ReLU(inplace)
573 | self.inception_4e_3x3_reduce = nn.Conv2d(608, 128, kernel_size=(1, 1), stride=(1, 1))
574 | self.inception_4e_3x3_reduce_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
575 | self.inception_4e_relu_3x3_reduce = nn.ReLU(inplace)
576 | self.inception_4e_3x3 = nn.Conv2d(128, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
577 | self.inception_4e_3x3_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
578 | self.inception_4e_relu_3x3 = nn.ReLU(inplace)
579 | self.inception_4e_double_3x3_reduce = nn.Conv2d(608, 192, kernel_size=(1, 1), stride=(1, 1))
580 | self.inception_4e_double_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
581 | self.inception_4e_relu_double_3x3_reduce = nn.ReLU(inplace)
582 | self.inception_4e_double_3x3_1 = nn.Conv2d(192, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
583 | self.inception_4e_double_3x3_1_bn = nn.BatchNorm2d(256, eps=1e-05, momentum=0.9, affine=True)
584 | self.inception_4e_relu_double_3x3_1 = nn.ReLU(inplace)
585 | self.inception_4e_double_3x3_2 = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
586 | self.inception_4e_double_3x3_2_bn = nn.BatchNorm2d(256, eps=1e-05, momentum=0.9, affine=True)
587 | self.inception_4e_relu_double_3x3_2 = nn.ReLU(inplace)
588 | self.inception_4e_pool = nn.MaxPool2d((3, 3), stride=(2, 2), dilation=(1, 1), ceil_mode=True)
589 | self.inception_5a_1x1 = nn.Conv2d(1056, 352, kernel_size=(1, 1), stride=(1, 1))
590 | self.inception_5a_1x1_bn = nn.BatchNorm2d(352, eps=1e-05, momentum=0.9, affine=True)
591 | self.inception_5a_relu_1x1 = nn.ReLU(inplace)
592 | self.inception_5a_3x3_reduce = nn.Conv2d(1056, 192, kernel_size=(1, 1), stride=(1, 1))
593 | self.inception_5a_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
594 | self.inception_5a_relu_3x3_reduce = nn.ReLU(inplace)
595 | self.inception_5a_3x3 = nn.Conv2d(192, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
596 | self.inception_5a_3x3_bn = nn.BatchNorm2d(320, eps=1e-05, momentum=0.9, affine=True)
597 | self.inception_5a_relu_3x3 = nn.ReLU(inplace)
598 | self.inception_5a_double_3x3_reduce = nn.Conv2d(1056, 160, kernel_size=(1, 1), stride=(1, 1))
599 | self.inception_5a_double_3x3_reduce_bn = nn.BatchNorm2d(160, eps=1e-05, momentum=0.9, affine=True)
600 | self.inception_5a_relu_double_3x3_reduce = nn.ReLU(inplace)
601 | self.inception_5a_double_3x3_1 = nn.Conv2d(160, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
602 | self.inception_5a_double_3x3_1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True)
603 | self.inception_5a_relu_double_3x3_1 = nn.ReLU(inplace)
604 | self.inception_5a_double_3x3_2 = nn.Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
605 | self.inception_5a_double_3x3_2_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True)
606 | self.inception_5a_relu_double_3x3_2 = nn.ReLU(inplace)
607 | self.inception_5a_pool = nn.AvgPool2d(3, stride=1, padding=1, ceil_mode=True, count_include_pad=True)
608 | self.inception_5a_pool_proj = nn.Conv2d(1056, 128, kernel_size=(1, 1), stride=(1, 1))
609 | self.inception_5a_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
610 | self.inception_5a_relu_pool_proj = nn.ReLU(inplace)
611 | self.inception_5b_1x1 = nn.Conv2d(1024, 352, kernel_size=(1, 1), stride=(1, 1))
612 | self.inception_5b_1x1_bn = nn.BatchNorm2d(352, eps=1e-05, momentum=0.9, affine=True)
613 | self.inception_5b_relu_1x1 = nn.ReLU(inplace)
614 | self.inception_5b_3x3_reduce = nn.Conv2d(1024, 192, kernel_size=(1, 1), stride=(1, 1))
615 | self.inception_5b_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
616 | self.inception_5b_relu_3x3_reduce = nn.ReLU(inplace)
617 | self.inception_5b_3x3 = nn.Conv2d(192, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
618 | self.inception_5b_3x3_bn = nn.BatchNorm2d(320, eps=1e-05, momentum=0.9, affine=True)
619 | self.inception_5b_relu_3x3 = nn.ReLU(inplace)
620 | self.inception_5b_double_3x3_reduce = nn.Conv2d(1024, 192, kernel_size=(1, 1), stride=(1, 1))
621 | self.inception_5b_double_3x3_reduce_bn = nn.BatchNorm2d(192, eps=1e-05, momentum=0.9, affine=True)
622 | self.inception_5b_relu_double_3x3_reduce = nn.ReLU(inplace)
623 | self.inception_5b_double_3x3_1 = nn.Conv2d(192, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
624 | self.inception_5b_double_3x3_1_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True)
625 | self.inception_5b_relu_double_3x3_1 = nn.ReLU(inplace)
626 | self.inception_5b_double_3x3_2 = nn.Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
627 | self.inception_5b_double_3x3_2_bn = nn.BatchNorm2d(224, eps=1e-05, momentum=0.9, affine=True)
628 | self.inception_5b_relu_double_3x3_2 = nn.ReLU(inplace)
629 | self.inception_5b_pool = nn.MaxPool2d((3, 3), stride=(1, 1), padding=(1, 1), dilation=(1, 1), ceil_mode=True)
630 | self.inception_5b_pool_proj = nn.Conv2d(1024, 128, kernel_size=(1, 1), stride=(1, 1))
631 | self.inception_5b_pool_proj_bn = nn.BatchNorm2d(128, eps=1e-05, momentum=0.9, affine=True)
632 | self.inception_5b_relu_pool_proj = nn.ReLU(inplace)
633 | self.global_pool = nn.AvgPool2d(7, stride=1, padding=0, ceil_mode=True, count_include_pad=True)
634 | self.fc = nn.Linear(1024, num_classes)
635 |
636 |
637 | def bninception(pretrained='imagenet'):
638 | r"""BNInception model architecture from `_ paper.
639 | """
640 | if pretrained is not None:
641 | print('=> Loading from pretrained model: {}'.format(pretrained))
642 | settings = pretrained_settings['bninception'][pretrained]
643 | num_classes = settings['num_classes']
644 | model = BNInception(num_classes=num_classes)
645 | model.load_state_dict(model_zoo.load_url(settings['url']))
646 | model.input_space = settings['input_space']
647 | model.input_size = settings['input_size']
648 | model.input_range = settings['input_range']
649 | model.mean = settings['mean']
650 | model.std = settings['std']
651 | else:
652 | raise NotImplementedError
653 | return model
654 |
655 |
656 | if __name__ == '__main__':
657 | model = bninception()
658 |
--------------------------------------------------------------------------------
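A minimal usage sketch for the `bninception` factory above (not part of the repository); it assumes the `forward` method and `pretrained_settings` table defined earlier in this file, plus network access for `model_zoo.load_url`:

    import torch
    from archs.bn_inception import bninception

    model = bninception(pretrained='imagenet').eval()  # loads the checkpoint URL from pretrained_settings
    dummy = torch.randn(1, 3, 224, 224)                # BN-Inception operates on 224x224 crops
    with torch.no_grad():
        logits = model(dummy)
    print(logits.shape)                                # (1, num_classes), e.g. (1, 1000) for the ImageNet settings
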
/archs/mobilenet_v2.py:
--------------------------------------------------------------------------------
1 | # Code adapted from https://github.com/tonylins/pytorch-mobilenet-v2
2 |
3 | import torch.nn as nn
4 | import math
5 |
6 |
7 | def conv_bn(inp, oup, stride):
8 | return nn.Sequential(
9 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
10 | nn.BatchNorm2d(oup),
11 | nn.ReLU6(inplace=True)
12 | )
13 |
14 |
15 | def conv_1x1_bn(inp, oup):
16 | return nn.Sequential(
17 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
18 | nn.BatchNorm2d(oup),
19 | nn.ReLU6(inplace=True)
20 | )
21 |
22 |
23 | def make_divisible(x, divisible_by=8):
24 | import numpy as np
25 | return int(np.ceil(x * 1. / divisible_by) * divisible_by)
26 |
27 |
28 | class InvertedResidual(nn.Module):
29 | def __init__(self, inp, oup, stride, expand_ratio):
30 | super(InvertedResidual, self).__init__()
31 | self.stride = stride
32 | assert stride in [1, 2]
33 |
34 | hidden_dim = int(inp * expand_ratio)
35 | self.use_res_connect = self.stride == 1 and inp == oup
36 |
37 | if expand_ratio == 1:
38 | self.conv = nn.Sequential(
39 | # dw
40 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
41 | nn.BatchNorm2d(hidden_dim),
42 | nn.ReLU6(inplace=True),
43 | # pw-linear
44 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
45 | nn.BatchNorm2d(oup),
46 | )
47 | else:
48 | self.conv = nn.Sequential(
49 | # pw
50 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
51 | nn.BatchNorm2d(hidden_dim),
52 | nn.ReLU6(inplace=True),
53 | # dw
54 | nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
55 | nn.BatchNorm2d(hidden_dim),
56 | nn.ReLU6(inplace=True),
57 | # pw-linear
58 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
59 | nn.BatchNorm2d(oup),
60 | )
61 |
62 | def forward(self, x):
63 | if self.use_res_connect:
64 | return x + self.conv(x)
65 | else:
66 | return self.conv(x)
67 |
68 |
69 | class MobileNetV2(nn.Module):
70 | def __init__(self, n_class=1000, input_size=224, width_mult=1.):
71 | super(MobileNetV2, self).__init__()
72 | block = InvertedResidual
73 | input_channel = 32
74 | last_channel = 1280
75 | inverted_residual_setting = [
76 | # t, c, n, s
77 | [1, 16, 1, 1],
78 | [6, 24, 2, 2],
79 | [6, 32, 3, 2],
80 | [6, 64, 4, 2],
81 | [6, 96, 3, 1],
82 | [6, 160, 3, 2],
83 | [6, 320, 1, 1],
84 | ]
85 |
86 | # building first layer
87 | assert input_size % 32 == 0
88 | # input_channel = make_divisible(input_channel * width_mult) # first channel is always 32!
89 | self.last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel
90 | self.features = [conv_bn(3, input_channel, 2)]
91 | # building inverted residual blocks
92 | for t, c, n, s in inverted_residual_setting:
93 | output_channel = make_divisible(c * width_mult) if t > 1 else c
94 | for i in range(n):
95 | if i == 0:
96 | self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
97 | else:
98 | self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
99 | input_channel = output_channel
100 | # building last several layers
101 | self.features.append(conv_1x1_bn(input_channel, self.last_channel))
102 | # make it nn.Sequential
103 | self.features = nn.Sequential(*self.features)
104 |
105 | # building classifier
106 | self.classifier = nn.Linear(self.last_channel, n_class)
107 |
108 | self._initialize_weights()
109 |
110 | def forward(self, x):
111 | x = self.features(x)
112 | x = x.mean(3).mean(2)
113 | x = self.classifier(x)
114 | return x
115 |
116 | def _initialize_weights(self):
117 | for m in self.modules():
118 | if isinstance(m, nn.Conv2d):
119 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
120 | m.weight.data.normal_(0, math.sqrt(2. / n))
121 | if m.bias is not None:
122 | m.bias.data.zero_()
123 | elif isinstance(m, nn.BatchNorm2d):
124 | m.weight.data.fill_(1)
125 | m.bias.data.zero_()
126 | elif isinstance(m, nn.Linear):
127 | n = m.weight.size(1)
128 | m.weight.data.normal_(0, 0.01)
129 | m.bias.data.zero_()
130 |
131 |
132 | def mobilenet_v2(pretrained=True):
133 | model = MobileNetV2(width_mult=1)
134 |
135 | if pretrained:
136 | try:
137 | from torch.hub import load_state_dict_from_url
138 | except ImportError:
139 | from torch.utils.model_zoo import load_url as load_state_dict_from_url
140 | state_dict = load_state_dict_from_url(
141 | 'https://www.dropbox.com/s/47tyzpofuuyyv1b/mobilenetv2_1.0-f2a8633.pth.tar?dl=1', progress=True)
142 | model.load_state_dict(state_dict)
143 | return model
144 |
145 |
146 | if __name__ == '__main__':
147 | net = mobilenet_v2(True)
148 |
149 |
150 |
151 |
152 |
153 |
--------------------------------------------------------------------------------
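A small sketch of the pieces above: `make_divisible` rounds channel counts to multiples of 8 when a width multiplier is applied, and a randomly initialised `MobileNetV2` (pretrained weights skipped here to avoid the Dropbox download) maps a 224x224 batch to class scores:

    import torch
    from archs.mobilenet_v2 import MobileNetV2, make_divisible

    print(make_divisible(96 * 0.75))   # 72  -- already a multiple of 8
    print(make_divisible(96 * 1.4))    # 136 -- 134.4 rounded up to the next multiple of 8

    net = MobileNetV2(n_class=1000, input_size=224, width_mult=1.).eval()
    with torch.no_grad():
        out = net(torch.randn(2, 3, 224, 224))
    print(out.shape)                   # torch.Size([2, 1000])
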
/main.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import sys
7 | import os
8 | import time
9 | import shutil
10 | import torch.nn.parallel
11 | import torch.backends.cudnn as cudnn
12 | import torch.optim
13 | from torch.nn.utils import clip_grad_norm_
14 |
15 | from ops.dataset import PANDataSet
16 | from ops.models import PAN
17 | from ops.transforms import *
18 | from opts import parser
19 | from ops import dataset_config
20 | from ops.utils import AverageMeter, accuracy
21 | from ops.temporal_shift import make_temporal_pool
22 |
23 | from tensorboardX import SummaryWriter
24 |
25 | best_prec1 = 0
26 |
27 | def main():
28 |
29 | global args, best_prec1
30 | args = parser.parse_args()
31 |
32 | if args.base == 'TSM':
33 | args.shift = True
34 | args.shift_div = 8
35 | args.shift_place = 'blockres'
36 |
37 | num_class, args.train_list, args.val_list, args.root_path, prefix = dataset_config.return_dataset(args.dataset,
38 | args.modality)
39 |
40 | full_arch_name = args.arch
41 | if args.shift:
42 | full_arch_name += '_shift{}_{}'.format(args.shift_div, args.shift_place)
43 | if args.temporal_pool:
44 | full_arch_name += '_tpool'
45 | args.store_name = '_'.join(
46 | ['PAN', args.modality, args.dataset, full_arch_name, args.consensus_type, 'segment%d' % args.num_segments,
47 | 'e{}'.format(args.epochs)])
48 | if args.pretrain != 'imagenet':
49 | args.store_name += '_{}'.format(args.pretrain)
50 | if args.lr_type != 'step':
51 | args.store_name += '_{}'.format(args.lr_type)
52 | if args.dense_sample:
53 | args.store_name += '_dense'
54 | if args.non_local > 0:
55 | args.store_name += '_nl'
56 | if args.suffix is not None:
57 | args.store_name += '_{}'.format(args.suffix)
58 | print('- storing name: ' + args.store_name)
59 |
60 | check_rootfolders()
61 |
62 | if args.modality == 'RGB':
63 | data_length = 1
64 | elif args.modality in ['PA', 'Lite']:
65 | data_length = 4
66 | elif args.modality in ['Flow', 'RGBDiff']:
67 | data_length = 5
68 |
69 | print("-"*30)
70 | print("Environment Versions:")
71 | print("- Python: {}".format(sys.version))
72 | print("- PyTorch: {}".format(torch.__version__))
73 | print("- TorchVison: {}".format(torchvision.__version__))
74 |
75 | args_dict = args.__dict__
76 | print("-"*30)
77 | print("PAN Configurations:")
78 | print(args_dict)
79 | print("-"*30)
80 |
81 | model = PAN(num_class, args.num_segments, args.modality,
82 | base_model=args.arch,
83 | consensus_type=args.consensus_type,
84 | dropout=args.dropout,
85 | img_feature_dim=args.img_feature_dim,
86 | partial_bn=not args.no_partialbn,
87 | pretrain=args.pretrain,
88 | is_shift=args.shift, shift_div=args.shift_div, shift_place=args.shift_place,
89 | fc_lr5=not (args.tune_from and args.dataset in args.tune_from),
90 | temporal_pool=args.temporal_pool,
91 | non_local=args.non_local, data_length=data_length, has_VAP=args.VAP)
92 |
93 | #print(model)
94 |
95 | crop_size = model.crop_size
96 | scale_size = model.scale_size
97 | input_mean = model.input_mean
98 | input_std = model.input_std
99 | policies = model.get_optim_policies()
100 | train_augmentation = model.get_augmentation(flip=False if 'something' in args.dataset or 'jester' in args.dataset else True)
101 |
102 | model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
103 |
104 | optimizer = torch.optim.SGD(policies,
105 | args.lr,
106 | momentum=args.momentum,
107 | weight_decay=args.weight_decay)
108 |
109 | if args.resume:
110 | if args.temporal_pool: # early temporal pool so that we can load the state_dict
111 | make_temporal_pool(model.module.base_model, args.num_segments)
112 | if os.path.isfile(args.resume):
113 | print(("=> loading checkpoint '{}'".format(args.resume)))
114 | checkpoint = torch.load(args.resume)
115 | args.start_epoch = checkpoint['epoch']
116 | best_prec1 = checkpoint['best_prec1']
117 | model.load_state_dict(checkpoint['state_dict'])
118 | optimizer.load_state_dict(checkpoint['optimizer'])
119 | print(("=> loaded checkpoint '{}' (epoch {})"
120 | .format(args.evaluate, checkpoint['epoch'])))
121 | else:
122 | print(("=> no checkpoint found at '{}'".format(args.resume)))
123 |
124 | if args.tune_from:
125 | print(("=> fine-tuning from '{}'".format(args.tune_from)))
126 | sd = torch.load(args.tune_from)
127 | sd = sd['state_dict']
128 | model_dict = model.state_dict()
129 | replace_dict = []
130 | for k, v in sd.items():
131 | if k not in model_dict and k.replace('.net', '') in model_dict:
132 | print('=> Load after remove .net: ', k)
133 | replace_dict.append((k, k.replace('.net', '')))
134 | for k, v in model_dict.items():
135 | if k not in sd and k.replace('.net', '') in sd:
136 | print('=> Load after adding .net: ', k)
137 | replace_dict.append((k.replace('.net', ''), k))
138 |
139 | for k, k_new in replace_dict:
140 | sd[k_new] = sd.pop(k)
141 | keys1 = set(list(sd.keys()))
142 | keys2 = set(list(model_dict.keys()))
143 | set_diff = (keys1 - keys2) | (keys2 - keys1)
144 | print('#### Notice: keys that failed to load: {}'.format(set_diff))
145 | if args.dataset not in args.tune_from: # new dataset
146 | print('=> New dataset, do not load fc weights')
147 | sd = {k: v for k, v in sd.items() if 'fc' not in k}
148 | if args.modality == 'Flow' and 'Flow' not in args.tune_from:
149 | sd = {k: v for k, v in sd.items() if 'conv1.weight' not in k}
150 | model_dict.update(sd)
151 | model.load_state_dict(model_dict)
152 |
153 | if args.temporal_pool and not args.resume:
154 | make_temporal_pool(model.module.base_model, args.num_segments)
155 |
156 | cudnn.benchmark = True
157 |
158 | # Data loading code
159 | if args.modality != 'RGBDiff':
160 | normalize = GroupNormalize(input_mean, input_std)
161 | else:
162 | normalize = IdentityTransform()
163 |
164 | train_loader = torch.utils.data.DataLoader(
165 | PANDataSet(args.root_path, args.train_list, num_segments=args.num_segments,
166 | new_length=data_length,
167 | modality=args.modality,
168 | image_tmpl=prefix,
169 | transform=torchvision.transforms.Compose([
170 | train_augmentation,
171 | Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
172 | ToTorchFormatTensor(div=(args.arch not in ['BNInception', 'InceptionV3'])),
173 | normalize,
174 | ]), dense_sample=args.dense_sample, is_lmdb=args.lmdb),
175 | batch_size=args.batch_size, shuffle=True,
176 | num_workers=args.workers, pin_memory=True,
177 | drop_last=True)  # drop the last incomplete batch so its size stays divisible by the number of GPUs
178 |
179 | val_loader = torch.utils.data.DataLoader(
180 | PANDataSet(args.root_path, args.val_list, num_segments=args.num_segments,
181 | new_length=data_length,
182 | modality=args.modality,
183 | image_tmpl=prefix,
184 | random_shift=False,
185 | transform=torchvision.transforms.Compose([
186 | GroupScale(int(scale_size)),
187 | GroupCenterCrop(crop_size),
188 | Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
189 | ToTorchFormatTensor(div=(args.arch not in ['BNInception', 'InceptionV3'])),
190 | normalize,
191 | ]), dense_sample=args.dense_sample, is_lmdb=args.lmdb),
192 | batch_size=args.batch_size, shuffle=False,
193 | num_workers=args.workers, pin_memory=True)
194 |
195 | # define loss function (criterion) and optimizer
196 | if args.loss_type == 'nll':
197 | criterion = torch.nn.CrossEntropyLoss().cuda()
198 | else:
199 | raise ValueError("Unknown loss type")
200 |
201 | print("-"*30)
202 | for group in policies:
203 | print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
204 | group['name'], len(group['params']), group['lr_mult'], group['decay_mult'])))
205 | print("-"*30)
206 |
207 | if args.evaluate:
208 | validate(val_loader, model, criterion, 0)
209 | return
210 |
211 | log_training = open(os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
212 | with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
213 | f.write(str(args))
214 | tf_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name))
215 | for epoch in range(args.start_epoch, args.epochs):
216 | adjust_learning_rate(optimizer, epoch, args.lr_type, args.lr_steps)
217 |
218 | # train for one epoch
219 | train(train_loader, model, criterion, optimizer, epoch, log_training, tf_writer)
220 |
221 | # evaluate on validation set
222 | if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
223 | prec1 = validate(val_loader, model, criterion, epoch, log_training, tf_writer)
224 |
225 | # remember best prec@1 and save checkpoint
226 | is_best = prec1 > best_prec1
227 | best_prec1 = max(prec1, best_prec1)
228 | tf_writer.add_scalar('acc/test_top1_best', best_prec1, epoch)
229 |
230 | output_best = 'Best Prec@1: %.3f\n' % (best_prec1)
231 | print(output_best)
232 | log_training.write(output_best + '\n')
233 | log_training.flush()
234 |
235 | save_checkpoint({
236 | 'epoch': epoch + 1,
237 | 'arch': args.arch,
238 | 'state_dict': model.state_dict(),
239 | 'optimizer': optimizer.state_dict(),
240 | 'best_prec1': best_prec1,
241 | }, is_best)
242 |
243 | def train(train_loader, model, criterion, optimizer, epoch, log, tf_writer):
244 | batch_time = AverageMeter()
245 | data_time = AverageMeter()
246 | losses = AverageMeter()
247 | top1 = AverageMeter()
248 | top5 = AverageMeter()
249 |
250 | if args.no_partialbn:
251 | model.module.partialBN(False)
252 | else:
253 | model.module.partialBN(True)
254 |
255 | # switch to train mode
256 | model.train()
257 |
258 | end = time.time()
259 | for i, (input, target) in enumerate(train_loader):
260 | # measure data loading time
261 | data_time.update(time.time() - end)
262 | if i == 20:
263 | os.system("gpustat")
264 | target = target.cuda()
265 | input_var = torch.autograd.Variable(input)
266 | target_var = torch.autograd.Variable(target)
267 |
268 | # compute output
269 | output = model(input_var)
270 | loss = criterion(output, target_var)
271 |
272 | # measure accuracy and record loss
273 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
274 | losses.update(loss.item(), input.size(0))
275 | top1.update(prec1.item(), input.size(0))
276 | top5.update(prec5.item(), input.size(0))
277 |
278 | # compute gradient and do SGD step
279 | loss.backward()
280 |
281 | no_grad_cnt = 0
282 |
283 | if i % args.iter_size == 0:
284 | # scale down gradients when iter size is functioning
285 | if args.iter_size != 1:
286 | for g in optimizer.param_groups:
287 | for p in g['params']:
288 | if isinstance(p.grad, torch.Tensor):
289 | p.grad /= args.iter_size
290 | else:
291 | no_grad_cnt = no_grad_cnt + 1
292 |
293 | if args.clip_gradient is not None:
294 | total_norm = clip_grad_norm_(model.parameters(), args.clip_gradient)
295 | else:
296 | total_norm = 0
297 |
298 | optimizer.step()
299 | optimizer.zero_grad()
300 |
301 | #if i == 0:
302 | # print("{}\nWARNING: There are {} params without gradient!!!!!\n{}".format("*"*50, no_grad_cnt, "*"*50))
303 |
304 | # measure elapsed time
305 | batch_time.update(time.time() - end)
306 | end = time.time()
307 |
308 | if i % args.print_freq == 0:
309 | output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
310 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
311 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
312 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
313 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
314 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
315 | epoch, i, len(train_loader), batch_time=batch_time,
316 | data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'] * 0.1)) # TODO
317 | print(output)
318 | log.write(output + '\n')
319 | log.flush()
320 |
321 | tf_writer.add_scalar('loss/train', losses.avg, epoch)
322 | tf_writer.add_scalar('acc/train_top1', top1.avg, epoch)
323 | tf_writer.add_scalar('acc/train_top5', top5.avg, epoch)
324 | tf_writer.add_scalar('lr', optimizer.param_groups[-1]['lr'], epoch)
325 |
326 |
327 | def validate(val_loader, model, criterion, epoch, log=None, tf_writer=None):
328 | batch_time = AverageMeter()
329 | losses = AverageMeter()
330 | top1 = AverageMeter()
331 | top5 = AverageMeter()
332 |
333 | # switch to evaluate mode
334 | model.eval()
335 |
336 | end = time.time()
337 | with torch.no_grad():
338 | for i, (input, target) in enumerate(val_loader):
339 | target = target.cuda()
340 |
341 | # compute output
342 | output = model(input)
343 | loss = criterion(output, target)
344 |
345 | # measure accuracy and record loss
346 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
347 |
348 | losses.update(loss.item(), input.size(0))
349 | top1.update(prec1.item(), input.size(0))
350 | top5.update(prec5.item(), input.size(0))
351 |
352 | # measure elapsed time
353 | batch_time.update(time.time() - end)
354 | end = time.time()
355 |
356 | if i % args.print_freq == 0:
357 | output = ('Test: [{0}/{1}]\t'
358 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
359 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
360 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
361 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
362 | i, len(val_loader), batch_time=batch_time, loss=losses,
363 | top1=top1, top5=top5))
364 | print(output)
365 | if log is not None:
366 | log.write(output + '\n')
367 | log.flush()
368 |
369 | output = ('Testing Results: Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Loss {loss.avg:.5f}'
370 | .format(top1=top1, top5=top5, loss=losses))
371 | print(output)
372 | if log is not None:
373 | log.write(output + '\n')
374 | log.flush()
375 |
376 | if tf_writer is not None:
377 | tf_writer.add_scalar('loss/test', losses.avg, epoch)
378 | tf_writer.add_scalar('acc/test_top1', top1.avg, epoch)
379 | tf_writer.add_scalar('acc/test_top5', top5.avg, epoch)
380 |
381 | return top1.avg
382 |
383 |
384 | def save_checkpoint(state, is_best):
385 | filename = '%s/%s/ckpt.pth.tar' % (args.root_model, args.store_name)
386 | torch.save(state, filename)
387 | if is_best:
388 | shutil.copyfile(filename, filename.replace('pth.tar', 'best.pth.tar'))
389 |
390 |
391 | def adjust_learning_rate(optimizer, epoch, lr_type, lr_steps):
392 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
393 | if lr_type == 'step':
394 | decay = 0.1 ** (sum(epoch >= np.array(lr_steps)))
395 | lr = args.lr * decay
396 | decay = args.weight_decay
397 | elif lr_type == 'cos':
398 | import math
399 | lr = 0.5 * args.lr * (1 + math.cos(math.pi * epoch / args.epochs))
400 | decay = args.weight_decay
401 | else:
402 | raise NotImplementedError
403 | for param_group in optimizer.param_groups:
404 | param_group['lr'] = lr * param_group['lr_mult']
405 | param_group['weight_decay'] = decay * param_group['decay_mult']
406 |
407 |
408 | def check_rootfolders():
409 | """Create log and model folder"""
410 | folders_util = [args.root_log, args.root_model,
411 | os.path.join(args.root_log, args.store_name),
412 | os.path.join(args.root_model, args.store_name)]
413 | for folder in folders_util:
414 | if not os.path.exists(folder):
415 | print('creating folder ' + folder)
416 | os.mkdir(folder)
417 |
418 | if __name__ == '__main__':
419 | main()
420 |
--------------------------------------------------------------------------------
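The learning-rate policy in `adjust_learning_rate` is easy to check in isolation; the sketch below mirrors its arithmetic (the base lr of 0.01, lr_steps of (30, 40) and 50 total epochs are purely illustrative values):

    import math
    import numpy as np

    def lr_at(epoch, lr_type, base_lr=0.01, lr_steps=(30, 40), total_epochs=50):
        # mirrors main.adjust_learning_rate, before the per-group lr_mult scaling
        if lr_type == 'step':
            return base_lr * 0.1 ** int(sum(epoch >= np.array(lr_steps)))
        if lr_type == 'cos':
            return 0.5 * base_lr * (1 + math.cos(math.pi * epoch / total_epochs))
        raise NotImplementedError(lr_type)

    print([round(lr_at(e, 'step'), 5) for e in (0, 29, 30, 40)])  # [0.01, 0.01, 0.001, 0.0001]
    print(round(lr_at(25, 'cos'), 4))                             # 0.005 -- half the base lr at mid-training
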
/ops/PAN_modules.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import torch
7 | from torch import nn
8 | import math
9 |
10 | class PA(nn.Module):
11 | def __init__(self, n_length):
12 | super(PA, self).__init__()
13 | self.shallow_conv = nn.Conv2d(3,8,7,1,3)
14 | self.n_length = n_length
15 | for m in self.modules():
16 | if isinstance(m, nn.Conv2d):
17 | nn.init.normal_(m.weight.data, 0, 0.001)
18 | nn.init.constant_(m.bias.data, 0)
19 |
20 | def forward(self, x):
21 | h, w = x.size(-2), x.size(-1)
22 | x = x.view((-1, 3) + x.size()[-2:])    # split the stacked input into individual RGB frames
23 | x = self.shallow_conv(x)                # shallow 7x7 conv -> 8 feature channels per frame
24 | x = x.view(-1, self.n_length, x.size(-3), x.size(-2)*x.size(-1))  # group n_length consecutive frames, flatten H*W
25 | for i in range(self.n_length-1):
26 | d_i = nn.PairwiseDistance(p=2)(x[:,i,:,:], x[:,i+1,:,:]).unsqueeze(1)  # feature-space L2 distance between adjacent frames
27 | d = d_i if i == 0 else torch.cat((d, d_i), 1)
28 | PA = d.view(-1, 1*(self.n_length-1), h, w)  # (n_length-1) persistence-of-appearance maps per segment
29 | return PA
30 |
31 | class VAP(nn.Module):
32 | def __init__(self, n_segment, feature_dim, num_class, dropout_ratio):
33 | super(VAP, self).__init__()
34 | VAP_level = int(math.log(n_segment, 2))
35 | print("=> Using {}-level VAP".format(VAP_level))
36 | self.n_segment = n_segment
37 | self.VAP_level = VAP_level
38 | total_timescale = 0
39 | for i in range(VAP_level):
40 | timescale = 2**i
41 | total_timescale += timescale
42 | setattr(self, "VAP_{}".format(timescale), nn.MaxPool3d((n_segment//timescale,1,1),1,0,(timescale,1,1)))
43 | self.GAP = nn.AdaptiveAvgPool1d(1)
44 | self.TES = nn.Sequential(
45 | nn.Linear(total_timescale, total_timescale*4, bias=False),
46 | nn.ReLU(inplace=True),
47 | nn.Linear(total_timescale*4, total_timescale, bias=False)
48 | )
49 | self.softmax = nn.Softmax(dim=1)
50 | self.dropout = nn.Dropout(p=dropout_ratio)
51 | self.pred = nn.Linear(feature_dim, num_class)
52 |
53 | # fc init
54 | for m in self.modules():
55 | if isinstance(m, nn.Linear):
56 | nn.init.normal_(m.weight.data, 0, 0.001)
57 | if hasattr(m.bias, 'data'):
58 | nn.init.constant_(m.bias.data, 0)
59 |
60 | def forward(self, x):
61 | _, d = x.size()
62 | x = x.view(-1, self.n_segment, d, 1, 1).permute(0,2,1,3,4)
63 | x = torch.cat(tuple([getattr(self, "VAP_{}".format(2**i))(x) for i in range(self.VAP_level)]), 2).squeeze(3).squeeze(3).permute(0,2,1)
64 | w = self.GAP(x).squeeze(2)
65 | w = self.softmax(self.TES(w))
66 | x = x * w.unsqueeze(2)
67 | x = x.sum(dim=1)
68 | x = self.dropout(x)
69 | x = self.pred(x.view(-1,d))
70 | return x
71 |
72 |
--------------------------------------------------------------------------------
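A shape-level sketch of the two modules above, with an illustrative batch of 2 clips, 8 segments and m = 4 frames per segment; it assumes the `PairwiseDistance` reduction over the feature-channel dimension that the final reshape in `PA.forward` relies on:

    import torch
    from ops.PAN_modules import PA, VAP

    n_segment, m = 8, 4
    frames = torch.randn(2 * n_segment * m, 3, 224, 224)   # individual RGB frames, m per segment

    pa = PA(n_length=m)
    pa_maps = pa(frames)
    print(pa_maps.shape)        # (2*8, m-1, 224, 224): one stack of PA motion maps per segment

    vap = VAP(n_segment=n_segment, feature_dim=2048, num_class=174, dropout_ratio=0.5)
    feats = torch.randn(2 * n_segment, 2048)               # one backbone feature vector per segment
    print(vap(feats).shape)     # (2, 174): one aggregated prediction per clip
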
/ops/__init__.py:
--------------------------------------------------------------------------------
1 | from ops.basic_ops import *
--------------------------------------------------------------------------------
/ops/basic_ops.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import torch
7 |
8 |
9 | class Identity(torch.nn.Module):
10 | def forward(self, input):
11 | return input
12 |
13 |
14 | class SegmentConsensus(torch.nn.Module):
15 |
16 | def __init__(self, consensus_type, dim=1):
17 | super(SegmentConsensus, self).__init__()
18 | self.consensus_type = consensus_type
19 | self.dim = dim
20 | self.shape = None
21 |
22 | def forward(self, input_tensor):
23 | self.shape = input_tensor.size()
24 | if self.consensus_type == 'avg':
25 | output = input_tensor.mean(dim=self.dim, keepdim=True)
26 | elif self.consensus_type == 'identity':
27 | output = input_tensor
28 | else:
29 | output = None
30 |
31 | return output
32 |
33 |
34 | class ConsensusModule(torch.nn.Module):
35 |
36 | def __init__(self, consensus_type, dim=1):
37 | super(ConsensusModule, self).__init__()
38 | self.consensus_type = consensus_type if consensus_type != 'rnn' else 'identity'
39 | self.dim = dim
40 |
41 | def forward(self, input):
42 | return SegmentConsensus(self.consensus_type, self.dim)(input)
43 |
--------------------------------------------------------------------------------
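A tiny sketch of the averaging consensus, which fuses per-segment scores along the segment dimension (the batch size of 4 and 174 classes are illustrative):

    import torch
    from ops.basic_ops import ConsensusModule

    scores = torch.randn(4, 8, 174)      # (batch, num_segments, num_class) per-segment class scores
    fused = ConsensusModule('avg')(scores)
    print(fused.shape)                   # torch.Size([4, 1, 174]) -- mean over dim=1, keepdim=True
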
/ops/dataset.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import torch.utils.data as data
7 |
8 | from PIL import Image
9 | import os
10 | import numpy as np
11 | from numpy.random import randint
12 | import lmdb
13 | from io import BytesIO
14 |
15 | class VideoRecord(object):
16 | def __init__(self, row):
17 | self._data = row
18 |
19 | @property
20 | def path(self):
21 | return self._data[0]
22 |
23 | @property
24 | def num_frames(self):
25 | return int(self._data[1])
26 |
27 | @property
28 | def label(self):
29 | return int(self._data[2])
30 |
31 |
32 | class PANDataSet(data.Dataset):
33 | def __init__(self, root_path, list_file,
34 | num_segments=3, new_length=1, modality='RGB',
35 | image_tmpl='img_{:05d}.jpg', transform=None,
36 | random_shift=True, test_mode=False,
37 | remove_missing=False, dense_sample=False, twice_sample=False, is_lmdb=False):
38 |
39 | self.root_path = root_path
40 | self.list_file = list_file
41 | self.num_segments = num_segments
42 | self.new_length = new_length
43 | self.modality = modality
44 | self.image_tmpl = image_tmpl
45 | self.transform = transform
46 | self.random_shift = random_shift
47 | self.test_mode = test_mode
48 | self.remove_missing = remove_missing
49 | self.dense_sample = dense_sample # using dense sample as I3D
50 | self.twice_sample = twice_sample # twice sample for more validation
51 | if self.dense_sample:
52 | print('=> Using dense sample for the dataset...')
53 | if self.twice_sample:
54 | print('=> Using twice sample for the dataset...')
55 |
56 | self.is_lmdb = is_lmdb
57 | if self.is_lmdb:
58 | print('=> Loading lmdb dataset from: {}'.format(self.root_path))
59 | self.database = lmdb.open(self.root_path, readonly=True).begin().cursor()
60 |
61 | if self.modality == 'RGBDiff':
62 | self.new_length += 1 # Diff needs one more image to calculate diff
63 |
64 | self._parse_list()
65 |
66 | def _load_image(self, directory, idx):
67 | if self.modality in ['RGB','PA', 'Lite', 'RGBDiff']:
68 | if self.is_lmdb:
69 | return [Image.open(BytesIO(self.database.get("{}/{:03d}/{:08d}".format(directory, 0, idx-1).encode())))]
70 | else:
71 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')]
72 | '''
73 | try:
74 | if self.is_lmdb:
75 | return [Image.open(BytesIO(self.database.get("{}/{:03d}/{:08d}".format(directory, 0, idx-1).encode())))]
76 | else:
77 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')]
78 | except Exception:
79 | print('error loading image:', os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
80 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')]
81 | '''
82 | elif self.modality == 'Flow':
83 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': # ucf
84 | x_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('x', idx))).convert(
85 | 'L')
86 | y_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('y', idx))).convert(
87 | 'L')
88 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': # something v1 flow
89 | x_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
90 | format(int(directory), 'x', idx))).convert('L')
91 | y_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
92 | format(int(directory), 'y', idx))).convert('L')
93 | else:
94 | try:
95 | # idx_skip = 1 + (idx-1)*5
96 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert(
97 | 'RGB')
98 | except Exception:
99 | print('error loading flow file:',
100 | os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
101 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')
102 | # the input flow file is RGB image with (flow_x, flow_y, blank) for each channel
103 | flow_x, flow_y, _ = flow.split()
104 | x_img = flow_x.convert('L')
105 | y_img = flow_y.convert('L')
106 |
107 | return [x_img, y_img]
108 |
109 | def _parse_list(self):
110 | # keep only videos with at least 3 frames (skipped in test mode unless remove_missing is set):
111 | tmp = [x.strip().split(' ') for x in open(self.list_file)]
112 | if not self.test_mode or self.remove_missing:
113 | tmp = [item for item in tmp if int(item[1]) >= 3]
114 | self.video_list = [VideoRecord(item) for item in tmp]
115 |
116 | if self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
117 | for v in self.video_list:
118 | v._data[1] = int(v._data[1]) // 2
119 | print('video number:%d' % (len(self.video_list)))
120 |
121 | def _sample_indices(self, record):
122 | """
123 |
124 | :param record: VideoRecord
125 | :return: list
126 | """
127 | if self.dense_sample: # i3d dense sample
128 | sample_pos = max(1, 1 + record.num_frames - 64)
129 | t_stride = 64 // self.num_segments
130 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
131 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
132 | return np.array(offsets) + 1
133 | else: # normal sample
134 | average_duration = (record.num_frames - self.new_length + 1) // self.num_segments
135 | if average_duration > 0:
136 | offsets = np.multiply(list(range(self.num_segments)), average_duration) + randint(average_duration,
137 | size=self.num_segments)
138 | elif record.num_frames > self.num_segments:
139 | offsets = np.sort(randint(record.num_frames - self.new_length + 1, size=self.num_segments))
140 | else:
141 | offsets = np.zeros((self.num_segments,))
142 | return offsets + 1
143 |
144 | def _get_val_indices(self, record):
145 | if self.dense_sample: # i3d dense sample
146 | sample_pos = max(1, 1 + record.num_frames - 64)
147 | t_stride = 64 // self.num_segments
148 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
149 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
150 | return np.array(offsets) + 1
151 | else:
152 | if record.num_frames > self.num_segments + self.new_length - 1:
153 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
154 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
155 | else:
156 | offsets = np.zeros((self.num_segments,))
157 | return offsets + 1
158 |
159 | def _get_test_indices(self, record):
160 | if self.dense_sample:
161 | sample_pos = max(1, 1 + record.num_frames - 64)
162 | t_stride = 64 // self.num_segments
163 | start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int)
164 | offsets = []
165 | for start_idx in start_list.tolist():
166 | offsets += [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
167 | return np.array(offsets) + 1
168 | elif self.twice_sample:
169 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
170 |
171 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)] +
172 | [int(tick * x) for x in range(self.num_segments)])
173 |
174 | return offsets + 1
175 | else:
176 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
177 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
178 | return offsets + 1
179 |
180 | def __getitem__(self, index):
181 | record = self.video_list[index]
182 | # check this is a legit video folder
183 |
184 | if self.image_tmpl == 'flow_{}_{:05d}.jpg':
185 | file_name = self.image_tmpl.format('x', 1)
186 | full_path = os.path.join(self.root_path, record.path, file_name)
187 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
188 | file_name = self.image_tmpl.format(int(record.path), 'x', 1)
189 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
190 | else:
191 | file_name = self.image_tmpl.format(1)
192 | full_path = os.path.join(self.root_path, record.path, file_name)
193 |
194 | '''
195 | while not os.path.exists(full_path):
196 | print('################## Not Found:', os.path.join(self.root_path, record.path, file_name))
197 | index = np.random.randint(len(self.video_list))
198 | record = self.video_list[index]
199 | if self.image_tmpl == 'flow_{}_{:05d}.jpg':
200 | file_name = self.image_tmpl.format('x', 1)
201 | full_path = os.path.join(self.root_path, record.path, file_name)
202 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
203 | file_name = self.image_tmpl.format(int(record.path), 'x', 1)
204 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
205 | else:
206 | file_name = self.image_tmpl.format(1)
207 | full_path = os.path.join(self.root_path, record.path, file_name)
208 | '''
209 |
210 | if not self.test_mode:
211 | segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record)
212 | else:
213 | segment_indices = self._get_test_indices(record)
214 | return self.get(record, segment_indices)
215 |
216 | def get(self, record, indices):
217 |
218 | images = list()
219 | for seg_ind in indices:
220 | p = int(seg_ind)
221 | for i in range(self.new_length):
222 | seg_imgs = self._load_image(record.path, p)
223 | images.extend(seg_imgs)
224 | if p < record.num_frames:
225 | p += 1
226 |
227 | process_data = self.transform(images)
228 | return process_data, record.label
229 |
230 | def __len__(self):
231 | return len(self.video_list)
232 |
--------------------------------------------------------------------------------
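The frame-index arithmetic in `_sample_indices` and `_get_val_indices` can be traced without any data on disk; the sketch below mirrors it for an illustrative video of 60 frames with 8 segments and new_length = 4:

    import numpy as np
    from numpy.random import randint

    num_segments, new_length, num_frames = 8, 4, 60

    # training: one random index inside each of the 8 equal chunks (mirrors _sample_indices)
    average_duration = (num_frames - new_length + 1) // num_segments
    train_idx = np.multiply(list(range(num_segments)), average_duration) \
                + randint(average_duration, size=num_segments) + 1

    # validation: the deterministic centre of each chunk (mirrors _get_val_indices)
    tick = (num_frames - new_length + 1) / float(num_segments)
    val_idx = np.array([int(tick / 2.0 + tick * x) for x in range(num_segments)]) + 1

    print(train_idx)   # random within each chunk, e.g. the first index always falls in [1, 7]
    print(val_idx)     # [ 4 11 18 25 33 40 47 54]
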
/ops/dataset_config.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import os
7 |
8 | ROOT_DATASET = '/data/zhangcan/dataset/'
9 |
10 |
11 | def return_ucf101(modality):
12 | filename_categories = 101
13 | if modality in ['RGB', 'PA', 'Lite']:
14 | root_data = ROOT_DATASET + 'ucf101_frames'
15 | filename_imglist_train = '/data/zhangcan/file_lists/ucf101/split1/train.txt'
16 | filename_imglist_val = '/data/zhangcan/file_lists/ucf101/split1/val.txt'
17 | prefix = 'img_{:05d}.jpg'
18 | elif modality == 'Flow':
19 | root_data = ROOT_DATASET + 'UCF101/jpg'
20 | filename_imglist_train = 'UCF101/file_list/ucf101_flow_train_split_1.txt'
21 | filename_imglist_val = 'UCF101/file_list/ucf101_flow_val_split_1.txt'
22 | prefix = 'flow_{}_{:05d}.jpg'
23 | else:
24 | raise NotImplementedError('no such modality:' + modality)
25 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
26 |
27 |
28 | def return_hmdb51(modality):
29 | filename_categories = 51
30 | if modality in ['RGB', 'PA', 'Lite']:
31 | root_data = ROOT_DATASET + 'hmdb51_frames'
32 | filename_imglist_train = '/data/zhangcan/file_lists/hmdb51/split1/train.txt'
33 | filename_imglist_val = '/data/zhangcan/file_lists/hmdb51/split1/val.txt'
34 | prefix = 'img_{:05d}.jpg'
35 | elif modality == 'Flow':
36 | root_data = ROOT_DATASET + 'HMDB51/images'
37 | filename_imglist_train = 'HMDB51/splits/hmdb51_flow_train_split_1.txt'
38 | filename_imglist_val = 'HMDB51/splits/hmdb51_flow_val_split_1.txt'
39 | prefix = 'flow_{}_{:05d}.jpg'
40 | else:
41 | raise NotImplementedError('no such modality:' + modality)
42 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
43 |
44 |
45 | def return_something(modality):
46 | filename_categories = 174
47 | if modality in ['RGB', 'PA', 'Lite']:
48 | root_data = ROOT_DATASET + 'sthv1_frames'
49 | filename_imglist_train = '/data/zhangcan/file_lists/sthv1/split/train.txt'
50 | filename_imglist_val = '/data/zhangcan/file_lists/sthv1/split/val.txt'
51 | prefix = '{:05d}.jpg'
52 | elif modality == 'Flow':
53 | root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1-flow'
54 | filename_imglist_train = 'something/v1/train_videofolder_flow.txt'
55 | filename_imglist_val = 'something/v1/val_videofolder_flow.txt'
56 | prefix = '{:06d}-{}_{:05d}.jpg'
57 | else:
58 | print('no such modality:'+modality)
59 | raise NotImplementedError
60 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
61 |
62 |
63 | def return_somethingv2(modality):
64 | filename_categories = 174
65 | if modality in ['RGB', 'PA', 'Lite']:
66 | root_data = ROOT_DATASET + 'sthv2_frames'
67 | filename_imglist_train = '/data/zhangcan/file_lists/sthv2/split/train.txt'
68 | filename_imglist_val = '/data/zhangcan/file_lists/sthv2/split/val.txt'
69 | prefix = '{:06d}.jpg'
70 | elif modality == 'Flow':
71 | root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-flow'
72 | filename_imglist_train = 'something/v2/train_videofolder_flow.txt'
73 | filename_imglist_val = 'something/v2/val_videofolder_flow.txt'
74 | prefix = '{:06d}.jpg'
75 | else:
76 | raise NotImplementedError('no such modality:'+modality)
77 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
78 |
79 |
80 | def return_jester(modality):
81 | filename_categories = 27
82 | if modality in ['RGB', 'PA', 'Lite']:
83 | prefix = '{:05d}.jpg'
84 | root_data = ROOT_DATASET + 'jester_frames'
85 | filename_imglist_train = '/data/zhangcan/file_lists/jester/split/train.txt'
86 | filename_imglist_val = '/data/zhangcan/file_lists/jester/split/val.txt'
87 | else:
88 | raise NotImplementedError('no such modality:'+modality)
89 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
90 |
91 |
92 | def return_kinetics(modality):
93 | filename_categories = 400
94 | if modality in ['RGB', 'PA', 'Lite']:
95 | root_data = ROOT_DATASET + 'kinetics400_frames'
96 | filename_imglist_train = '/data/zhangcan/file_lists/kin400/split/train.txt'
97 | filename_imglist_val = '/data/zhangcan/file_lists/kin400/split/val.txt'
98 | prefix = 'img_{:05d}.jpg'
99 | else:
100 | raise NotImplementedError('no such modality:' + modality)
101 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
102 |
103 |
104 | def return_dataset(dataset, modality):
105 | dict_single = {'jester': return_jester, 'something': return_something, 'somethingv2': return_somethingv2,
106 | 'ucf101': return_ucf101, 'hmdb51': return_hmdb51,
107 | 'kinetics': return_kinetics }
108 | if dataset in dict_single:
109 | file_categories, file_imglist_train, file_imglist_val, root_data, prefix = dict_single[dataset](modality)
110 | else:
111 | raise ValueError('Unknown dataset '+dataset)
112 |
113 | file_imglist_train = os.path.join(ROOT_DATASET, file_imglist_train)
114 | file_imglist_val = os.path.join(ROOT_DATASET, file_imglist_val)
115 | if isinstance(file_categories, str):
116 | file_categories = os.path.join(ROOT_DATASET, file_categories)
117 | with open(file_categories) as f:
118 | lines = f.readlines()
119 | categories = [item.rstrip() for item in lines]
120 | else: # number of categories
121 | categories = [None] * file_categories
122 | n_class = len(categories)
123 | print('{}: {} classes'.format(dataset, n_class))
124 | return n_class, file_imglist_train, file_imglist_val, root_data, prefix
125 |
--------------------------------------------------------------------------------
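A quick sketch of what `return_dataset` hands back to `main.py`; the hard-coded paths under ROOT_DATASET only exist on the authors' machines, but the class count and frame-name template are returned regardless:

    from ops import dataset_config

    n_class, train_list, val_list, root_path, prefix = dataset_config.return_dataset('something', 'PA')
    print(n_class)   # 174
    print(prefix)    # {:05d}.jpg -- the frame filename template used by PANDataSet
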
/ops/models.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | from torch import nn
7 |
8 | from ops.basic_ops import ConsensusModule
9 | from ops.transforms import *
10 | from torch.nn.init import normal_, constant_
11 | from ops.PAN_modules import PA, VAP
12 |
13 | class PAN(nn.Module):
14 | def __init__(self, num_class, num_segments, modality,
15 | base_model='resnet101', new_length=None,
16 | consensus_type='avg', before_softmax=True,
17 | dropout=0.8, img_feature_dim=256,
18 | crop_num=1, partial_bn=True, print_spec=False, pretrain='imagenet',
19 | is_shift=False, shift_div=8, shift_place='blockres', fc_lr5=False,
20 | temporal_pool=False, non_local=False, data_length=1, has_VAP=False):
21 | super(PAN, self).__init__()
22 | self.modality = modality
23 | self.num_segments = num_segments
24 | self.reshape = True
25 | self.before_softmax = before_softmax
26 | self.dropout = dropout
27 | self.crop_num = crop_num
28 | self.consensus_type = consensus_type
29 | self.img_feature_dim = img_feature_dim # the dimension of the CNN feature to represent each frame
30 | self.pretrain = pretrain
31 |
32 | self.is_shift = is_shift
33 | self.shift_div = shift_div
34 | self.shift_place = shift_place
35 | self.base_model_name = base_model
36 | self.fc_lr5 = fc_lr5
37 | self.temporal_pool = temporal_pool
38 | self.non_local = non_local
39 | self.data_length = data_length
40 | self.num_class = num_class
41 | self.has_VIP = has_VAP
42 |
43 | if not before_softmax and consensus_type != 'avg':
44 | raise ValueError("Only avg consensus can be used after Softmax")
45 |
46 | if new_length is None:
47 | self.new_length = 1 if modality == "RGB" or modality in ["PA", "Lite"] else 5
48 | else:
49 | self.new_length = new_length
50 | if print_spec:
51 | print(("""
52 | Initializing PAN with base model: {}.
53 | PAN Configurations:
54 | input_modality: {}
55 | num_segments: {}
56 | new_length: {}
57 | consensus_module: {}
58 | dropout_ratio: {}
59 | img_feature_dim: {}
60 | """.format(base_model, self.modality, self.num_segments, self.new_length, consensus_type, self.dropout, self.img_feature_dim)))
61 |
62 | self._prepare_base_model(base_model)
63 |
64 | if self.has_VIP:
65 | feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features
66 | setattr(self.base_model, self.base_model.last_layer_name, VAP(self.num_segments, feature_dim, self.num_class, self.dropout))
67 | else:
68 | feature_dim = self._prepare_tsn(num_class)
69 |
70 | if self.modality == 'Flow':
71 | print("Converting the ImageNet model to a flow init model")
72 | self.base_model = self._construct_flow_model(self.base_model)
73 | print("Done. Flow model ready...")
74 | elif self.modality == 'Lite':
75 | print("=> Converting the ImageNet model to a PAN_Lite init model")
76 | self.base_model = self._construct_pa_model(self.base_model)
77 | print("=> Done. PAN_lite model ready...")
78 | elif self.modality == 'RGBDiff':
79 | print("Converting the ImageNet model to RGB+Diff init model")
80 | self.base_model = self._construct_diff_model(self.base_model)
81 | print("Done. RGBDiff model ready.")
82 |
83 | if not self.has_VIP:
84 | self.consensus = ConsensusModule(consensus_type)
85 |
86 | if not self.before_softmax:
87 |             self.softmax = nn.Softmax(dim=1)
88 |
89 | self._enable_pbn = partial_bn
90 | if partial_bn:
91 | self.partialBN(True)
92 |
93 | def _prepare_tsn(self, num_class):
94 | feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features
95 | if self.dropout == 0:
96 | setattr(self.base_model, self.base_model.last_layer_name, nn.Linear(feature_dim, num_class))
97 | self.new_fc = None
98 | else:
99 | setattr(self.base_model, self.base_model.last_layer_name, nn.Dropout(p=self.dropout))
100 | self.new_fc = nn.Linear(feature_dim, num_class)
101 |
102 | std = 0.001
103 | if self.new_fc is None:
104 | normal_(getattr(self.base_model, self.base_model.last_layer_name).weight, 0, std)
105 | constant_(getattr(self.base_model, self.base_model.last_layer_name).bias, 0)
106 | else:
107 | if hasattr(self.new_fc, 'weight'):
108 | normal_(self.new_fc.weight, 0, std)
109 | constant_(self.new_fc.bias, 0)
110 | return feature_dim
111 |
112 | def _prepare_base_model(self, base_model):
113 | print('=> base model: {}'.format(base_model))
114 |
115 | if 'resnet' in base_model:
116 | if self.modality in ["PA", "Lite"]:
117 | self.PA = PA(self.data_length)
118 | self.base_model = getattr(torchvision.models, base_model)(True if self.pretrain == 'imagenet' else False)
119 | if self.is_shift:
120 | print('=> Adding temporal shift...')
121 | from ops.temporal_shift import make_temporal_shift
122 | make_temporal_shift(self.base_model, self.num_segments,
123 | n_div=self.shift_div, place=self.shift_place, temporal_pool=self.temporal_pool)
124 |
125 | if self.non_local:
126 | print('=> Adding non-local module...')
127 | from ops.non_local import make_non_local
128 | make_non_local(self.base_model, self.num_segments)
129 |
130 | self.base_model.last_layer_name = 'fc'
131 | self.input_size = 224
132 | self.input_mean = [0.485, 0.456, 0.406]
133 | self.input_std = [0.229, 0.224, 0.225]
134 |
135 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1)
136 |
137 | if self.modality == 'Flow':
138 | self.input_mean = [0.5]
139 | self.input_std = [np.mean(self.input_std)]
140 | elif self.modality == 'RGBDiff':
141 | self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length
142 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
143 |
144 | elif base_model == 'mobilenetv2':
145 | from archs.mobilenet_v2 import mobilenet_v2, InvertedResidual
146 | self.base_model = mobilenet_v2(True if self.pretrain == 'imagenet' else False)
147 |
148 | self.base_model.last_layer_name = 'classifier'
149 | self.input_size = 224
150 | self.input_mean = [0.485, 0.456, 0.406]
151 | self.input_std = [0.229, 0.224, 0.225]
152 |
153 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1)
154 | if self.is_shift:
155 | from ops.temporal_shift import TemporalShift
156 | for m in self.base_model.modules():
157 | if isinstance(m, InvertedResidual) and len(m.conv) == 8 and m.use_res_connect:
158 | if self.print_spec:
159 | print('Adding temporal shift... {}'.format(m.use_res_connect))
160 | m.conv[0] = TemporalShift(m.conv[0], n_segment=self.num_segments, n_div=self.shift_div)
161 | if self.modality == 'Flow':
162 | self.input_mean = [0.5]
163 | self.input_std = [np.mean(self.input_std)]
164 | elif self.modality == 'RGBDiff':
165 | self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length
166 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
167 |
168 | elif base_model == 'BNInception':
169 | from archs.bn_inception import bninception
170 | self.base_model = bninception(pretrained=self.pretrain)
171 | self.input_size = self.base_model.input_size
172 | self.input_mean = self.base_model.mean
173 | self.input_std = self.base_model.std
174 | self.base_model.last_layer_name = 'fc'
175 | if self.modality == 'Flow':
176 | self.input_mean = [128]
177 | elif self.modality == 'RGBDiff':
178 | self.input_mean = self.input_mean * (1 + self.new_length)
179 | if self.is_shift:
180 | print('Adding temporal shift...')
181 | self.base_model.build_temporal_ops(
182 | self.num_segments, is_temporal_shift=self.shift_place, shift_div=self.shift_div)
183 | else:
184 | raise ValueError('Unknown base model: {}'.format(base_model))
185 |
186 | def train(self, mode=True):
187 | """
188 | Override the default train() to freeze the BN parameters
189 | :return:
190 | """
191 | super(PAN, self).train(mode)
192 | count = 0
193 | if self._enable_pbn and mode:
194 | print("Freezing BatchNorm2D except the first one.")
195 | for m in self.base_model.modules():
196 | if isinstance(m, nn.BatchNorm2d):
197 | count += 1
198 | if count >= (2 if self._enable_pbn else 1):
199 | m.eval()
200 | # shutdown update in frozen mode
201 | m.weight.requires_grad = False
202 | m.bias.requires_grad = False
203 |
204 | def partialBN(self, enable):
205 | self._enable_pbn = enable
206 |
207 | def get_optim_policies(self):
208 | first_conv_weight = []
209 | first_conv_bias = []
210 | normal_weight = []
211 | normal_bias = []
212 | lr5_weight = []
213 | lr10_bias = []
214 | bn = []
215 | custom_ops = []
216 |
217 | conv_cnt = 0
218 | bn_cnt = 0
219 | for m in self.modules():
220 | if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv3d):
221 | ps = list(m.parameters())
222 | conv_cnt += 1
223 | if conv_cnt == 1:
224 | first_conv_weight.append(ps[0])
225 | if len(ps) == 2:
226 | first_conv_bias.append(ps[1])
227 | else:
228 | normal_weight.append(ps[0])
229 | if len(ps) == 2:
230 | normal_bias.append(ps[1])
231 | elif isinstance(m, torch.nn.Linear):
232 | ps = list(m.parameters())
233 | if self.fc_lr5:
234 | lr5_weight.append(ps[0])
235 | else:
236 | normal_weight.append(ps[0])
237 | if len(ps) == 2:
238 | if self.fc_lr5:
239 | lr10_bias.append(ps[1])
240 | else:
241 | normal_bias.append(ps[1])
242 |
243 | elif isinstance(m, torch.nn.BatchNorm2d):
244 | bn_cnt += 1
245 | # later BN's are frozen
246 | if not self._enable_pbn or bn_cnt == 1:
247 | bn.extend(list(m.parameters()))
248 | elif isinstance(m, torch.nn.BatchNorm3d):
249 | bn_cnt += 1
250 | # later BN's are frozen
251 | if not self._enable_pbn or bn_cnt == 1:
252 | bn.extend(list(m.parameters()))
253 | elif len(m._modules) == 0:
254 | if len(list(m.parameters())) > 0:
255 | raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m)))
256 |
257 | return [
258 | {'params': first_conv_weight, 'lr_mult': 5 if self.modality == 'Flow' or self.modality in ['PA', 'Lite'] else 1, 'decay_mult': 1,
259 | 'name': "first_conv_weight"},
260 | {'params': first_conv_bias, 'lr_mult': 10 if self.modality == 'Flow' or self.modality in ['PA', 'Lite'] else 2, 'decay_mult': 0,
261 | 'name': "first_conv_bias"},
262 | {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1,
263 | 'name': "normal_weight"},
264 | {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0,
265 | 'name': "normal_bias"},
266 | {'params': bn, 'lr_mult': 1, 'decay_mult': 0,
267 | 'name': "BN scale/shift"},
268 | {'params': custom_ops, 'lr_mult': 1, 'decay_mult': 1,
269 | 'name': "custom_ops"},
270 | # for fc
271 | {'params': lr5_weight, 'lr_mult': 5, 'decay_mult': 1,
272 | 'name': "lr5_weight"},
273 | {'params': lr10_bias, 'lr_mult': 10, 'decay_mult': 0,
274 | 'name': "lr10_bias"},
275 | ]
276 |
277 | def forward(self, input, no_reshape=False):
278 | if not no_reshape:
279 | sample_len = (3 if self.modality in ['RGB', 'PA', 'Lite'] else 2) * self.new_length
280 |
281 | if self.modality == 'RGBDiff':
282 | sample_len = 3 * self.new_length
283 | input = self._get_diff(input)
284 |
285 | if self.modality == 'PA':
286 | base_out = self.PA(input.view((-1, sample_len) + input.size()[-2:]))
287 | base_out = self.base_model(base_out)
288 | elif self.modality == 'Lite':
289 | input = input.view((-1, sample_len) + input.size()[-2:])
290 | PA = self.PA(input)
291 | RGB = input.view((-1, self.data_length, sample_len) + input.size()[-2:])[:,0,:,:,:]
292 | base_out = torch.cat((RGB, PA), 1)
293 | base_out = self.base_model(base_out)
294 | else:
295 | base_out = self.base_model(input.view((-1, sample_len) + input.size()[-2:]))
296 | else:
297 | base_out = self.base_model(input)
298 |
299 | if self.has_VIP:
300 | return base_out
301 |
302 | if self.dropout > 0:
303 | base_out = self.new_fc(base_out)
304 |
305 | if not self.before_softmax:
306 | base_out = self.softmax(base_out)
307 |
308 | if self.reshape:
309 | if self.is_shift and self.temporal_pool:
310 | base_out = base_out.view((-1, self.num_segments // 2) + base_out.size()[1:])
311 | else:
312 | base_out = base_out.view((-1, self.num_segments) + base_out.size()[1:])
313 | output = self.consensus(base_out)
314 | return output.squeeze(1)
315 |
316 | def _get_diff(self, input, keep_rgb=False):
317 | input_c = 3 if self.modality in ["RGB", "PA", "Lite", "RGBDiff"] else 2
318 | input_view = input.view((-1, self.num_segments, self.new_length + 1, input_c,) + input.size()[2:])
319 | if keep_rgb:
320 | new_data = input_view.clone()
321 | else:
322 | new_data = input_view[:, :, 1:, :, :, :].clone()
323 |
324 | for x in reversed(list(range(1, self.new_length + 1))):
325 | if keep_rgb:
326 | new_data[:, :, x, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :]
327 | else:
328 | new_data[:, :, x - 1, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :]
329 |
330 | return new_data
331 |
332 | def _construct_pa_model(self, base_model):
333 | # modify the convolution layers
334 | # Torch models are usually defined in a hierarchical way.
335 |         # nn.Module.modules() returns all sub-modules in a DFS manner
336 | modules = list(self.base_model.modules())
337 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]
338 | conv_layer = modules[first_conv_idx]
339 | container = modules[first_conv_idx - 1]
340 |
341 | # modify parameters, assume the first blob contains the convolution kernels
342 | params = [x.clone() for x in conv_layer.parameters()]
343 | kernel_size = params[0].size()
344 | new_kernel_size = kernel_size[:1] + (6, ) + kernel_size[2:]
345 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
346 |
347 | new_conv = nn.Conv2d(6, conv_layer.out_channels,
348 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding,
349 | bias=True if len(params) == 2 else False)
350 | new_conv.weight.data = new_kernels
351 | if len(params) == 2:
352 |             new_conv.bias.data = params[1].data # add bias if necessary
353 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
354 |
355 |         # replace the first convolution layer
356 | setattr(container, layer_name, new_conv)
357 |
358 | if self.base_model_name == 'BNInception':
359 | import torch.utils.model_zoo as model_zoo
360 | sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1')
361 | base_model.load_state_dict(sd)
362 | print('=> Loading pretrained Flow weight done...')
363 | return base_model
364 |
365 | def _construct_flow_model(self, base_model):
366 | # modify the convolution layers
367 | # Torch models are usually defined in a hierarchical way.
368 |         # nn.Module.modules() returns all sub-modules in a DFS manner
369 | modules = list(self.base_model.modules())
370 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]
371 | conv_layer = modules[first_conv_idx]
372 | container = modules[first_conv_idx - 1]
373 |
374 | # modify parameters, assume the first blob contains the convolution kernels
375 | params = [x.clone() for x in conv_layer.parameters()]
376 | kernel_size = params[0].size()
377 | new_kernel_size = kernel_size[:1] + (2 * self.new_length, ) + kernel_size[2:]
378 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
379 |
380 | new_conv = nn.Conv2d(2 * self.new_length, conv_layer.out_channels,
381 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding,
382 | bias=True if len(params) == 2 else False)
383 | new_conv.weight.data = new_kernels
384 | if len(params) == 2:
385 |             new_conv.bias.data = params[1].data # add bias if necessary
386 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
387 |
388 |         # replace the first convolution layer
389 | setattr(container, layer_name, new_conv)
390 |
391 | if self.base_model_name == 'BNInception':
392 | import torch.utils.model_zoo as model_zoo
393 | sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1')
394 | base_model.load_state_dict(sd)
395 | print('=> Loading pretrained Flow weight done...')
396 | else:
397 | print('#' * 30, 'Warning! No Flow pretrained model is found')
398 | return base_model
399 |
400 | def _construct_diff_model(self, base_model, keep_rgb=False):
401 | # modify the convolution layers
402 | # Torch models are usually defined in a hierarchical way.
403 |         # nn.Module.modules() returns all sub-modules in a DFS manner
404 | modules = list(self.base_model.modules())
405 |         first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]  # list() needed: filter() is lazy in Python 3
406 | conv_layer = modules[first_conv_idx]
407 | container = modules[first_conv_idx - 1]
408 |
409 | # modify parameters, assume the first blob contains the convolution kernels
410 | params = [x.clone() for x in conv_layer.parameters()]
411 | kernel_size = params[0].size()
412 | if not keep_rgb:
413 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
414 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
415 | else:
416 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
417 | new_kernels = torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()),
418 | 1)
419 | new_kernel_size = kernel_size[:1] + (3 + 3 * self.new_length,) + kernel_size[2:]
420 |
421 | new_conv = nn.Conv2d(new_kernel_size[1], conv_layer.out_channels,
422 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding,
423 | bias=True if len(params) == 2 else False)
424 | new_conv.weight.data = new_kernels
425 | if len(params) == 2:
426 |             new_conv.bias.data = params[1].data # add bias if necessary
427 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
428 |
429 | # replace the first convolution layer
430 | setattr(container, layer_name, new_conv)
431 | return base_model
432 |
433 | @property
434 | def crop_size(self):
435 | return self.input_size
436 |
437 | @property
438 | def scale_size(self):
439 | return self.input_size * 256 // 224
440 |
441 | def get_augmentation(self, flip=True):
442 | if self.modality in ['RGB', 'PA', 'Lite']:
443 | if flip:
444 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66]),
445 | GroupRandomHorizontalFlip(is_flow=False)])
446 | else:
447 | print('=> NO FLIP!!!')
448 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66])])
449 | elif self.modality == 'Flow':
450 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]),
451 | GroupRandomHorizontalFlip(is_flow=True)])
452 | elif self.modality == 'RGBDiff':
453 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]),
454 | GroupRandomHorizontalFlip(is_flow=False)])
455 |
--------------------------------------------------------------------------------
/ops/non_local.py:
--------------------------------------------------------------------------------
1 | # Non-local block using embedded gaussian
2 | # Code from
3 | # https://github.com/AlexHex7/Non-local_pytorch/blob/master/Non-Local_pytorch_0.3.1/lib/non_local_embedded_gaussian.py
4 |
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 |
10 | class _NonLocalBlockND(nn.Module):
11 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
12 | super(_NonLocalBlockND, self).__init__()
13 |
14 | assert dimension in [1, 2, 3]
15 |
16 | self.dimension = dimension
17 | self.sub_sample = sub_sample
18 |
19 | self.in_channels = in_channels
20 | self.inter_channels = inter_channels
21 |
22 | if self.inter_channels is None:
23 | self.inter_channels = in_channels // 2
24 | if self.inter_channels == 0:
25 | self.inter_channels = 1
26 |
27 | if dimension == 3:
28 | conv_nd = nn.Conv3d
29 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
30 | bn = nn.BatchNorm3d
31 | elif dimension == 2:
32 | conv_nd = nn.Conv2d
33 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
34 | bn = nn.BatchNorm2d
35 | else:
36 | conv_nd = nn.Conv1d
37 | max_pool_layer = nn.MaxPool1d(kernel_size=(2))
38 | bn = nn.BatchNorm1d
39 |
40 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
41 | kernel_size=1, stride=1, padding=0)
42 |
43 | if bn_layer:
44 | self.W = nn.Sequential(
45 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
46 | kernel_size=1, stride=1, padding=0),
47 | bn(self.in_channels)
48 | )
49 | nn.init.constant_(self.W[1].weight, 0)
50 | nn.init.constant_(self.W[1].bias, 0)
51 | else:
52 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
53 | kernel_size=1, stride=1, padding=0)
54 | nn.init.constant_(self.W.weight, 0)
55 | nn.init.constant_(self.W.bias, 0)
56 |
57 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
58 | kernel_size=1, stride=1, padding=0)
59 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
60 | kernel_size=1, stride=1, padding=0)
61 |
62 | if sub_sample:
63 | self.g = nn.Sequential(self.g, max_pool_layer)
64 | self.phi = nn.Sequential(self.phi, max_pool_layer)
65 |
66 | def forward(self, x):
67 | '''
68 | :param x: (b, c, t, h, w)
69 | :return:
70 | '''
71 |
72 | batch_size = x.size(0)
73 |
74 | g_x = self.g(x).view(batch_size, self.inter_channels, -1)
75 | g_x = g_x.permute(0, 2, 1)
76 |
77 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
78 | theta_x = theta_x.permute(0, 2, 1)
79 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
80 | f = torch.matmul(theta_x, phi_x)
81 | f_div_C = F.softmax(f, dim=-1)
82 |
83 | y = torch.matmul(f_div_C, g_x)
84 | y = y.permute(0, 2, 1).contiguous()
85 | y = y.view(batch_size, self.inter_channels, *x.size()[2:])
86 | W_y = self.W(y)
87 | z = W_y + x
88 |
89 | return z
90 |
91 |
92 | class NONLocalBlock1D(_NonLocalBlockND):
93 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
94 | super(NONLocalBlock1D, self).__init__(in_channels,
95 | inter_channels=inter_channels,
96 | dimension=1, sub_sample=sub_sample,
97 | bn_layer=bn_layer)
98 |
99 |
100 | class NONLocalBlock2D(_NonLocalBlockND):
101 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
102 | super(NONLocalBlock2D, self).__init__(in_channels,
103 | inter_channels=inter_channels,
104 | dimension=2, sub_sample=sub_sample,
105 | bn_layer=bn_layer)
106 |
107 |
108 | class NONLocalBlock3D(_NonLocalBlockND):
109 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
110 | super(NONLocalBlock3D, self).__init__(in_channels,
111 | inter_channels=inter_channels,
112 | dimension=3, sub_sample=sub_sample,
113 | bn_layer=bn_layer)
114 |
115 |
116 | class NL3DWrapper(nn.Module):
117 | def __init__(self, block, n_segment):
118 | super(NL3DWrapper, self).__init__()
119 | self.block = block
120 | self.nl = NONLocalBlock3D(block.bn3.num_features)
121 | self.n_segment = n_segment
122 |
123 | def forward(self, x):
124 | x = self.block(x)
125 |
126 | nt, c, h, w = x.size()
127 | x = x.view(nt // self.n_segment, self.n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
128 | x = self.nl(x)
129 | x = x.transpose(1, 2).contiguous().view(nt, c, h, w)
130 | return x
131 |
132 |
133 | def make_non_local(net, n_segment):
134 | import torchvision
135 | import archs
136 | if isinstance(net, torchvision.models.ResNet):
137 | net.layer2 = nn.Sequential(
138 | NL3DWrapper(net.layer2[0], n_segment),
139 | net.layer2[1],
140 | NL3DWrapper(net.layer2[2], n_segment),
141 | net.layer2[3],
142 | )
143 | net.layer3 = nn.Sequential(
144 | NL3DWrapper(net.layer3[0], n_segment),
145 | net.layer3[1],
146 | NL3DWrapper(net.layer3[2], n_segment),
147 | net.layer3[3],
148 | NL3DWrapper(net.layer3[4], n_segment),
149 | net.layer3[5],
150 | )
151 | else:
152 | raise NotImplementedError
153 |
154 |
155 | if __name__ == '__main__':
156 | from torch.autograd import Variable
157 | import torch
158 |
159 | sub_sample = True
160 | bn_layer = True
161 |
162 | img = Variable(torch.zeros(2, 3, 20))
163 | net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer)
164 | out = net(img)
165 | print(out.size())
166 |
167 | img = Variable(torch.zeros(2, 3, 20, 20))
168 | net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer)
169 | out = net(img)
170 | print(out.size())
171 |
172 | img = Variable(torch.randn(2, 3, 10, 20, 20))
173 | net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer)
174 | out = net(img)
175 | print(out.size())
176 |
--------------------------------------------------------------------------------
/ops/temporal_shift.py:
--------------------------------------------------------------------------------
1 | # Code from "TSM: Temporal Shift Module for Efficient Video Understanding"
2 | # https://github.com/mit-han-lab/temporal-shift-module
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 |
9 | class TemporalShift(nn.Module):
10 | def __init__(self, net, n_segment=3, n_div=8, inplace=False):
11 | super(TemporalShift, self).__init__()
12 | self.net = net
13 | self.n_segment = n_segment
14 | self.fold_div = n_div
15 | self.inplace = inplace
16 | if inplace:
17 | print('=> Using in-place shift...')
18 | #print('=> Using fold div: {}'.format(self.fold_div))
19 |
20 | def forward(self, x):
21 | x = self.shift(x, self.n_segment, fold_div=self.fold_div, inplace=self.inplace)
22 | return self.net(x)
23 |
24 | @staticmethod
25 | def shift(x, n_segment, fold_div=3, inplace=False):
26 | nt, c, h, w = x.size()
27 | n_batch = nt // n_segment
28 | x = x.view(n_batch, n_segment, c, h, w)
29 |
30 | fold = c // fold_div
31 | if inplace:
32 |             # In-place shift is disabled due to an out-of-order error under parallel (multi-GPU) execution;
33 |             # a custom CUDA kernel may be needed.
34 | raise NotImplementedError
35 | # out = InplaceShift.apply(x, fold)
36 | else:
37 | out = torch.zeros_like(x)
38 | out[:, :-1, :fold] = x[:, 1:, :fold] # shift left
39 | out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right
40 | out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift
41 |
42 | return out.view(nt, c, h, w)
43 |
44 |
45 | class InplaceShift(torch.autograd.Function):
46 | # Special thanks to @raoyongming for the help to this function
47 | @staticmethod
48 | def forward(ctx, input, fold):
49 | # not support higher order gradient
50 | # input = input.detach_()
51 | ctx.fold_ = fold
52 | n, t, c, h, w = input.size()
53 | buffer = input.data.new(n, t, fold, h, w).zero_()
54 | buffer[:, :-1] = input.data[:, 1:, :fold]
55 | input.data[:, :, :fold] = buffer
56 | buffer.zero_()
57 | buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold]
58 | input.data[:, :, fold: 2 * fold] = buffer
59 | return input
60 |
61 | @staticmethod
62 | def backward(ctx, grad_output):
63 | # grad_output = grad_output.detach_()
64 | fold = ctx.fold_
65 | n, t, c, h, w = grad_output.size()
66 | buffer = grad_output.data.new(n, t, fold, h, w).zero_()
67 | buffer[:, 1:] = grad_output.data[:, :-1, :fold]
68 | grad_output.data[:, :, :fold] = buffer
69 | buffer.zero_()
70 | buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold]
71 | grad_output.data[:, :, fold: 2 * fold] = buffer
72 | return grad_output, None
73 |
74 |
75 | class TemporalPool(nn.Module):
76 | def __init__(self, net, n_segment):
77 | super(TemporalPool, self).__init__()
78 | self.net = net
79 | self.n_segment = n_segment
80 |
81 | def forward(self, x):
82 | x = self.temporal_pool(x, n_segment=self.n_segment)
83 | return self.net(x)
84 |
85 | @staticmethod
86 | def temporal_pool(x, n_segment):
87 | nt, c, h, w = x.size()
88 | n_batch = nt // n_segment
89 | x = x.view(n_batch, n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
90 | x = F.max_pool3d(x, kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))
91 | x = x.transpose(1, 2).contiguous().view(nt // 2, c, h, w)
92 | return x
93 |
94 |
95 | def make_temporal_shift(net, n_segment, n_div=8, place='blockres', temporal_pool=False):
96 | if temporal_pool:
97 | n_segment_list = [n_segment, n_segment // 2, n_segment // 2, n_segment // 2]
98 | else:
99 | n_segment_list = [n_segment] * 4
100 | assert n_segment_list[-1] > 0
101 | #print('=> n_segment per stage: {}'.format(n_segment_list))
102 |
103 | import torchvision
104 | if isinstance(net, torchvision.models.ResNet):
105 | if place == 'block':
106 | def make_block_temporal(stage, this_segment):
107 | blocks = list(stage.children())
108 | #print('=> Processing stage with {} blocks'.format(len(blocks)))
109 | for i, b in enumerate(blocks):
110 | blocks[i] = TemporalShift(b, n_segment=this_segment, n_div=n_div)
111 | return nn.Sequential(*(blocks))
112 |
113 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
114 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
115 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
116 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
117 |
118 | elif 'blockres' in place:
119 | n_round = 1
120 | if len(list(net.layer3.children())) >= 23:
121 | n_round = 2
122 | #print('=> Using n_round {} to insert temporal shift'.format(n_round))
123 |
124 | def make_block_temporal(stage, this_segment):
125 | blocks = list(stage.children())
126 | #print('=> Processing stage with {} blocks residual'.format(len(blocks)))
127 | for i, b in enumerate(blocks):
128 | if i % n_round == 0:
129 | blocks[i].conv1 = TemporalShift(b.conv1, n_segment=this_segment, n_div=n_div)
130 | return nn.Sequential(*blocks)
131 |
132 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
133 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
134 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
135 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
136 | else:
137 | raise NotImplementedError(place)
138 |
139 |
140 | def make_temporal_pool(net, n_segment):
141 | import torchvision
142 | if isinstance(net, torchvision.models.ResNet):
143 |         print('=> Injecting temporal pooling')
144 | net.layer2 = TemporalPool(net.layer2, n_segment)
145 | else:
146 | raise NotImplementedError
147 |
148 |
149 | if __name__ == '__main__':
150 |     # test in-place shift vs. vanilla shift (note: the in-place path is currently disabled in shift() and raises NotImplementedError)
151 | tsm1 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=False)
152 | tsm2 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=True)
153 |
154 | print('=> Testing CPU...')
155 | # test forward
156 | with torch.no_grad():
157 | for i in range(10):
158 | x = torch.rand(2 * 8, 3, 224, 224)
159 | y1 = tsm1(x)
160 | y2 = tsm2(x)
161 | assert torch.norm(y1 - y2).item() < 1e-5
162 |
163 | # test backward
164 | with torch.enable_grad():
165 | for i in range(10):
166 | x1 = torch.rand(2 * 8, 3, 224, 224)
167 | x1.requires_grad_()
168 | x2 = x1.clone()
169 | y1 = tsm1(x1)
170 | y2 = tsm2(x2)
171 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
172 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
173 | assert torch.norm(grad1 - grad2).item() < 1e-5
174 |
175 | print('=> Testing GPU...')
176 | tsm1.cuda()
177 | tsm2.cuda()
178 | # test forward
179 | with torch.no_grad():
180 | for i in range(10):
181 | x = torch.rand(2 * 8, 3, 224, 224).cuda()
182 | y1 = tsm1(x)
183 | y2 = tsm2(x)
184 | assert torch.norm(y1 - y2).item() < 1e-5
185 |
186 | # test backward
187 | with torch.enable_grad():
188 | for i in range(10):
189 | x1 = torch.rand(2 * 8, 3, 224, 224).cuda()
190 | x1.requires_grad_()
191 | x2 = x1.clone()
192 | y1 = tsm1(x1)
193 | y2 = tsm2(x2)
194 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
195 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
196 | assert torch.norm(grad1 - grad2).item() < 1e-5
197 | print('Test passed.')
198 |
199 |
200 |
201 |
202 |
--------------------------------------------------------------------------------
/ops/transforms.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import torchvision
7 | import random
8 | from PIL import Image, ImageOps
9 | import numpy as np
10 | import numbers
11 | import math
12 | import torch
13 |
14 |
15 | class GroupRandomCrop(object):
16 | def __init__(self, size):
17 | if isinstance(size, numbers.Number):
18 | self.size = (int(size), int(size))
19 | else:
20 | self.size = size
21 |
22 | def __call__(self, img_group):
23 |
24 | w, h = img_group[0].size
25 | th, tw = self.size
26 |
27 | out_images = list()
28 |
29 | x1 = random.randint(0, w - tw)
30 | y1 = random.randint(0, h - th)
31 |
32 | for img in img_group:
33 | assert(img.size[0] == w and img.size[1] == h)
34 | if w == tw and h == th:
35 | out_images.append(img)
36 | else:
37 | out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
38 |
39 | return out_images
40 |
41 |
42 | class GroupCenterCrop(object):
43 | def __init__(self, size):
44 | self.worker = torchvision.transforms.CenterCrop(size)
45 |
46 | def __call__(self, img_group):
47 | return [self.worker(img) for img in img_group]
48 |
49 |
50 | class GroupRandomHorizontalFlip(object):
51 | """Randomly horizontally flips the given PIL.Image with a probability of 0.5
52 | """
53 | def __init__(self, is_flow=False):
54 | self.is_flow = is_flow
55 |
56 | def __call__(self, img_group, is_flow=False):
57 | v = random.random()
58 | if v < 0.5:
59 | ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
60 | if self.is_flow:
61 | for i in range(0, len(ret), 2):
62 | ret[i] = ImageOps.invert(ret[i]) # invert flow pixel values when flipping
63 | return ret
64 | else:
65 | return img_group
66 |
67 |
68 | class GroupNormalize(object):
69 | def __init__(self, mean, std):
70 | self.mean = mean
71 | self.std = std
72 |
73 | def __call__(self, tensor):
74 | rep_mean = self.mean * (tensor.size()[0]//len(self.mean))
75 | rep_std = self.std * (tensor.size()[0]//len(self.std))
76 |
77 | # TODO: make efficient
78 | for t, m, s in zip(tensor, rep_mean, rep_std):
79 | t.sub_(m).div_(s)
80 |
81 | return tensor
82 |
83 |
84 | class GroupScale(object):
85 | """ Rescales the input PIL.Image to the given 'size'.
86 | 'size' will be the size of the smaller edge.
87 | For example, if height > width, then image will be
88 | rescaled to (size * height / width, size)
89 | size: size of the smaller edge
90 | interpolation: Default: PIL.Image.BILINEAR
91 | """
92 |
93 | def __init__(self, size, interpolation=Image.BILINEAR):
94 | self.worker = torchvision.transforms.Resize(size, interpolation)
95 |
96 | def __call__(self, img_group):
97 | return [self.worker(img) for img in img_group]
98 |
99 |
100 | class GroupOverSample(object):
101 | def __init__(self, crop_size, scale_size=None, flip=True):
102 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size)
103 |
104 | if scale_size is not None:
105 | self.scale_worker = GroupScale(scale_size)
106 | else:
107 | self.scale_worker = None
108 | self.flip = flip
109 |
110 | def __call__(self, img_group):
111 |
112 | if self.scale_worker is not None:
113 | img_group = self.scale_worker(img_group)
114 |
115 | image_w, image_h = img_group[0].size
116 | crop_w, crop_h = self.crop_size
117 |
118 | offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h)
119 | oversample_group = list()
120 | for o_w, o_h in offsets:
121 | normal_group = list()
122 | flip_group = list()
123 | for i, img in enumerate(img_group):
124 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
125 | normal_group.append(crop)
126 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
127 |
128 | if img.mode == 'L' and i % 2 == 0:
129 | flip_group.append(ImageOps.invert(flip_crop))
130 | else:
131 | flip_group.append(flip_crop)
132 |
133 | oversample_group.extend(normal_group)
134 | if self.flip:
135 | oversample_group.extend(flip_group)
136 | return oversample_group
137 |
138 |
139 | class GroupFullResSample(object):
140 | def __init__(self, crop_size, scale_size=None, flip=True):
141 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size)
142 |
143 | if scale_size is not None:
144 | self.scale_worker = GroupScale(scale_size)
145 | else:
146 | self.scale_worker = None
147 | self.flip = flip
148 |
149 | def __call__(self, img_group):
150 |
151 | if self.scale_worker is not None:
152 | img_group = self.scale_worker(img_group)
153 |
154 | image_w, image_h = img_group[0].size
155 | crop_w, crop_h = self.crop_size
156 |
157 | w_step = (image_w - crop_w) // 4
158 | h_step = (image_h - crop_h) // 4
159 |
160 | offsets = list()
161 | offsets.append((0 * w_step, 2 * h_step)) # left
162 | offsets.append((4 * w_step, 2 * h_step)) # right
163 | offsets.append((2 * w_step, 2 * h_step)) # center
164 |
165 | oversample_group = list()
166 | for o_w, o_h in offsets:
167 | normal_group = list()
168 | flip_group = list()
169 | for i, img in enumerate(img_group):
170 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
171 | normal_group.append(crop)
172 | if self.flip:
173 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
174 |
175 | if img.mode == 'L' and i % 2 == 0:
176 | flip_group.append(ImageOps.invert(flip_crop))
177 | else:
178 | flip_group.append(flip_crop)
179 |
180 | oversample_group.extend(normal_group)
181 | oversample_group.extend(flip_group)
182 | return oversample_group
183 |
184 |
185 | class GroupMultiScaleCrop(object):
186 |
187 | def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True):
188 | self.scales = scales if scales is not None else [1, .875, .75, .66]
189 | self.max_distort = max_distort
190 | self.fix_crop = fix_crop
191 | self.more_fix_crop = more_fix_crop
192 | self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size]
193 | self.interpolation = Image.BILINEAR
194 |
195 | def __call__(self, img_group):
196 |
197 | im_size = img_group[0].size
198 |
199 | crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
200 | crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group]
201 | ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
202 | for img in crop_img_group]
203 | return ret_img_group
204 |
205 | def _sample_crop_size(self, im_size):
206 | image_w, image_h = im_size[0], im_size[1]
207 |
208 | # find a crop size
209 | base_size = min(image_w, image_h)
210 | crop_sizes = [int(base_size * x) for x in self.scales]
211 | crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes]
212 | crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes]
213 |
214 | pairs = []
215 | for i, h in enumerate(crop_h):
216 | for j, w in enumerate(crop_w):
217 | if abs(i - j) <= self.max_distort:
218 | pairs.append((w, h))
219 |
220 | crop_pair = random.choice(pairs)
221 | if not self.fix_crop:
222 | w_offset = random.randint(0, image_w - crop_pair[0])
223 | h_offset = random.randint(0, image_h - crop_pair[1])
224 | else:
225 | w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])
226 |
227 | return crop_pair[0], crop_pair[1], w_offset, h_offset
228 |
229 | def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
230 | offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h)
231 | return random.choice(offsets)
232 |
233 | @staticmethod
234 | def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
235 | w_step = (image_w - crop_w) // 4
236 | h_step = (image_h - crop_h) // 4
237 |
238 | ret = list()
239 | ret.append((0, 0)) # upper left
240 | ret.append((4 * w_step, 0)) # upper right
241 | ret.append((0, 4 * h_step)) # lower left
242 | ret.append((4 * w_step, 4 * h_step)) # lower right
243 | ret.append((2 * w_step, 2 * h_step)) # center
244 |
245 | if more_fix_crop:
246 | ret.append((0, 2 * h_step)) # center left
247 | ret.append((4 * w_step, 2 * h_step)) # center right
248 | ret.append((2 * w_step, 4 * h_step)) # lower center
249 | ret.append((2 * w_step, 0 * h_step)) # upper center
250 |
251 | ret.append((1 * w_step, 1 * h_step)) # upper left quarter
252 | ret.append((3 * w_step, 1 * h_step)) # upper right quarter
253 | ret.append((1 * w_step, 3 * h_step)) # lower left quarter
254 |             ret.append((3 * w_step, 3 * h_step))  # lower right quarter
255 |
256 | return ret
257 |
258 |
259 | class GroupRandomSizedCrop(object):
260 | """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
261 |     and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
262 | This is popularly used to train the Inception networks
263 | size: size of the smaller edge
264 | interpolation: Default: PIL.Image.BILINEAR
265 | """
266 | def __init__(self, size, interpolation=Image.BILINEAR):
267 | self.size = size
268 | self.interpolation = interpolation
269 |
270 | def __call__(self, img_group):
271 | for attempt in range(10):
272 | area = img_group[0].size[0] * img_group[0].size[1]
273 | target_area = random.uniform(0.08, 1.0) * area
274 | aspect_ratio = random.uniform(3. / 4, 4. / 3)
275 |
276 | w = int(round(math.sqrt(target_area * aspect_ratio)))
277 | h = int(round(math.sqrt(target_area / aspect_ratio)))
278 |
279 | if random.random() < 0.5:
280 | w, h = h, w
281 |
282 | if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
283 | x1 = random.randint(0, img_group[0].size[0] - w)
284 | y1 = random.randint(0, img_group[0].size[1] - h)
285 | found = True
286 | break
287 | else:
288 | found = False
289 | x1 = 0
290 | y1 = 0
291 |
292 | if found:
293 | out_group = list()
294 | for img in img_group:
295 | img = img.crop((x1, y1, x1 + w, y1 + h))
296 | assert(img.size == (w, h))
297 | out_group.append(img.resize((self.size, self.size), self.interpolation))
298 | return out_group
299 | else:
300 | # Fallback
301 | scale = GroupScale(self.size, interpolation=self.interpolation)
302 | crop = GroupRandomCrop(self.size)
303 | return crop(scale(img_group))
304 |
305 |
306 | class Stack(object):
307 |
308 | def __init__(self, roll=False):
309 | self.roll = roll
310 |
311 | def __call__(self, img_group):
312 | if img_group[0].mode == 'L':
313 | return np.concatenate([np.expand_dims(x, 2) for x in img_group], axis=2)
314 | elif img_group[0].mode == 'RGB':
315 | if self.roll:
316 | return np.concatenate([np.array(x)[:, :, ::-1] for x in img_group], axis=2)
317 | else:
318 | return np.concatenate(img_group, axis=2)
319 |
320 |
321 | class ToTorchFormatTensor(object):
322 | """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
323 | to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
324 | def __init__(self, div=True):
325 | self.div = div
326 |
327 | def __call__(self, pic):
328 | if isinstance(pic, np.ndarray):
329 | # handle numpy array
330 | img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
331 | else:
332 | # handle PIL Image
333 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
334 | img = img.view(pic.size[1], pic.size[0], len(pic.mode))
335 | # put it from HWC to CHW format
336 | # yikes, this transpose takes 80% of the loading time/CPU
337 | img = img.transpose(0, 1).transpose(0, 2).contiguous()
338 | return img.float().div(255) if self.div else img.float()
339 |
340 |
341 | class IdentityTransform(object):
342 |
343 | def __call__(self, data):
344 | return data
345 |
346 |
347 | if __name__ == "__main__":
348 | trans = torchvision.transforms.Compose([
349 | GroupScale(256),
350 | GroupRandomCrop(224),
351 | Stack(),
352 | ToTorchFormatTensor(),
353 | GroupNormalize(
354 | mean=[.485, .456, .406],
355 | std=[.229, .224, .225]
356 | )]
357 | )
358 |
359 | im = Image.open('../tensorflow-model-zoo.torch/lena_299.png')
360 |
361 | color_group = [im] * 3
362 | rst = trans(color_group)
363 |
364 | gray_group = [im.convert('L')] * 9
365 | gray_rst = trans(gray_group)
366 |
367 | trans2 = torchvision.transforms.Compose([
368 | GroupRandomSizedCrop(256),
369 | Stack(),
370 | ToTorchFormatTensor(),
371 | GroupNormalize(
372 | mean=[.485, .456, .406],
373 | std=[.229, .224, .225])
374 | ])
375 | print(trans2(color_group))
376 |
--------------------------------------------------------------------------------
/ops/utils.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import numpy as np
7 |
8 |
9 | def softmax(scores):
10 | es = np.exp(scores - scores.max(axis=-1)[..., None])
11 | return es / es.sum(axis=-1)[..., None]
12 |
13 |
14 | class AverageMeter(object):
15 | """Computes and stores the average and current value"""
16 |
17 | def __init__(self):
18 | self.reset()
19 |
20 | def reset(self):
21 | self.val = 0
22 | self.avg = 0
23 | self.sum = 0
24 | self.count = 0
25 |
26 | def update(self, val, n=1):
27 | self.val = val
28 | self.sum += val * n
29 | self.count += n
30 | self.avg = self.sum / self.count
31 |
32 |
33 | def accuracy(output, target, topk=(1,)):
34 | """Computes the precision@k for the specified values of k"""
35 | maxk = max(topk)
36 | batch_size = target.size(0)
37 |
38 | _, pred = output.topk(maxk, 1, True, True)
39 | pred = pred.t()
40 | correct = pred.eq(target.view(1, -1).expand_as(pred))
41 |
42 | res = []
43 | for k in topk:
44 | correct_k = correct[:k].view(-1).float().sum(0)
45 | res.append(correct_k.mul_(100.0 / batch_size))
46 | return res
47 |
--------------------------------------------------------------------------------
/opts.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import argparse
7 | parser = argparse.ArgumentParser(description="PyTorch implementation of PAN")
8 | parser.add_argument('dataset', type=str)
9 | parser.add_argument('modality', type=str, choices=['Lite', 'RGB', 'PA', 'Flow'])
10 | parser.add_argument('--train_list', type=str, default="")
11 | parser.add_argument('--val_list', type=str, default="")
12 | parser.add_argument('--root_path', type=str, default="")
13 | parser.add_argument('--store_name', type=str, default="")
14 | parser.add_argument('--lmdb', default=False, action="store_true", help='use lmdb format dataset')
15 | # ========================= Model Configs ==========================
16 | parser.add_argument('--arch', type=str, default="BNInception")
17 | parser.add_argument('--num_segments', type=int, default=8)
18 | parser.add_argument('--consensus_type', type=str, default='avg')
19 | parser.add_argument('--k', type=int, default=3)
20 |
21 | parser.add_argument('--dropout', '--do', default=0.5, type=float,
22 | metavar='DO', help='dropout ratio (default: 0.5)')
23 | parser.add_argument('--loss_type', type=str, default="nll",
24 | choices=['nll'])
25 | parser.add_argument('--img_feature_dim', default=256, type=int, help="the feature dimension for each frame")
26 | parser.add_argument('--suffix', type=str, default=None)
27 | parser.add_argument('--pretrain', type=str, default='imagenet')
28 | parser.add_argument('--tune_from', type=str, default=None, help='fine-tune from checkpoint')
29 | parser.add_argument('--base', default='TSM', type=str, choices=['TSN', 'TSM'])
30 |
31 | # ========================= Learning Configs ==========================
32 | parser.add_argument('--epochs', default=120, type=int, metavar='N',
33 | help='number of total epochs to run')
34 | parser.add_argument('-b', '--batch-size', default=128, type=int,
35 |                     metavar='N', help='mini-batch size (default: 128)')
36 | parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
37 | metavar='LR', help='initial learning rate')
38 | parser.add_argument('--lr_type', default='step', type=str,
39 | metavar='LRtype', help='learning rate type')
40 | parser.add_argument('--lr_steps', default=[50, 100], type=float, nargs="+",
41 | metavar='LRSteps', help='epochs to decay learning rate by 10')
42 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
43 | help='momentum')
44 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
45 |                     metavar='W', help='weight decay (default: 1e-4)')
46 | parser.add_argument('--clip-gradient', '--gd', default=20, type=float,
47 |                     metavar='W', help='gradient norm clipping (default: 20)')
48 | parser.add_argument('--no_partialbn', '--npb', default=True, action="store_true")  # note: with default=True this flag is effectively always set
49 | parser.add_argument('-i', '--iter-size', default=1, type=int,
50 |                     metavar='N', help='number of gradient accumulation iterations before an update')
51 |
52 | # ========================= Monitor Configs ==========================
53 | parser.add_argument('--print-freq', '-p', default=20, type=int,
54 |                     metavar='N', help='print frequency (default: 20)')
55 | parser.add_argument('--eval-freq', '-ef', default=1, type=int,
56 |                     metavar='N', help='evaluation frequency (default: 1)')
57 |
58 |
59 | # ========================= Runtime Configs ==========================
60 | parser.add_argument('-j', '--workers', default=8, type=int, metavar='N',
61 | help='number of data loading workers (default: 8)')
62 | parser.add_argument('--resume', default='', type=str, metavar='PATH',
63 | help='path to latest checkpoint (default: none)')
64 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
65 | help='evaluate model on validation set')
66 | parser.add_argument('--snapshot_pref', type=str, default="")
67 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
68 | help='manual epoch number (useful on restarts)')
69 | parser.add_argument('--gpus', nargs='+', type=int, default=None)
70 | parser.add_argument('--flow_prefix', default="", type=str)
71 | parser.add_argument('--root_log',type=str, default='log')
72 | parser.add_argument('--root_model', type=str, default='checkpoint')
73 |
74 | parser.add_argument('--shift', default=False, action="store_true", help='use shift for models')
75 | parser.add_argument('--shift_div', default=8, type=int, help='number of div for shift (default: 8)')
76 | parser.add_argument('--shift_place', default='blockres', type=str, help='place for shift (default: blockres)')
77 |
78 | parser.add_argument('--temporal_pool', default=False, action="store_true", help='add temporal pooling')
79 | parser.add_argument('--non_local', default=False, action="store_true", help='add non local block')
80 |
81 | parser.add_argument('--dense_sample', default=False, action="store_true", help='use dense sample for video dataset')
82 |
83 | parser.add_argument('--VAP', default=True, action="store_true", help='use VAP for various-timescale aggregation')
84 |
--------------------------------------------------------------------------------
/pretrained/models_urls.md:
--------------------------------------------------------------------------------
1 | ## Pretrained Models
2 |
3 | Here we provide pretrained PAN models on the Something-Something-V1 & V2 datasets. Recognizing actions in these datasets requires strong temporal modeling, since many action classes are temporally symmetric (e.g., they differ only in the direction of motion). PAN achieves state-of-the-art performance on both datasets; notably, it even surpasses optical-flow-based methods while using only RGB frames as input.
4 |
5 | ### Something-Something-V1
6 | 
7 | | Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
8 | | :---: | :---: | :---: | :---: | :---: | :---: |
9 | | PAN<sub>Lite</sub> | ResNet-50 | 35.7G * 1 | 48.0 | 76.1 | [Google Drive] or [Weiyun] |
10 | | PAN<sub>Full</sub> | ResNet-50 | 67.7G * 1 | 50.5 | 79.2 | [Google Drive] or [Weiyun] |
11 | | PAN<sub>En</sub> | ResNet-50 | (46.6G+88.4G) * 2 | 53.4 | 81.1 | [Google Drive] or [Weiyun] |
12 | | PAN<sub>En</sub> | ResNet-101 | (85.6G+166.1G) * 2 | 55.3 | 82.8 | [Google Drive] or [Weiyun] |
13 | 
14 | ### Something-Something-V2
15 | 
16 | | Model | Backbone | FLOPs * views | Val Top1 | Val Top5 | Checkpoints |
17 | | :---: | :---: | :---: | :---: | :---: | :---: |
18 | | PAN<sub>Lite</sub> | ResNet-50 | 35.7G * 1 | 60.8 | 86.7 | [Google Drive] or [Weiyun] |
19 | | PAN<sub>Full</sub> | ResNet-50 | 67.7G * 1 | 63.8 | 88.6 | [Google Drive] or [Weiyun] |
20 | | PAN<sub>En</sub> | ResNet-50 | (46.6G+88.4G) * 2 | 66.2 | 90.1 | [Google Drive] or [Weiyun] |
21 | | PAN<sub>En</sub> | ResNet-101 | (85.6G+166.1G) * 2 | 66.5 | 90.6 | [Google Drive] or [Weiyun] |
22 | 
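23 | ### Loading a checkpoint
24 | 
25 | The snippet below is a minimal sketch (not part of the original codebase) of restoring the PAN-Lite Something-V1 checkpoint above into the `PAN` class from `ops/models.py`; the settings mirror `scripts/train/sthv1/Lite.sh`, `scripts/test/sthv1/Lite.sh`, and the modality/`data_length` mapping in `test_models.py`. It assumes the archive stores its weights under a `state_dict` key and that keys may carry a `module.` prefix from `nn.DataParallel`; adjust these assumptions to match the actual file. `test_models.py` performs the full evaluation pipeline.
26 | 
27 | ```python
28 | import torch
29 | 
30 | from ops.models import PAN
31 | 
32 | num_class = 174  # Something-Something-V1 has 174 classes
33 | # Hyper-parameters mirror the Lite scripts; only tensor shapes matter for loading.
34 | net = PAN(num_class, num_segments=8, modality='Lite',
35 |           base_model='resnet50', consensus_type='avg', dropout=0.5,
36 |           is_shift=True, shift_div=8, shift_place='blockres',
37 |           data_length=4, has_VAP=True)
38 | 
39 | # File name follows the convention used by the test scripts; download the checkpoint first.
40 | checkpoint = torch.load(
41 |     'pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar',
42 |     map_location='cpu')
43 | state_dict = checkpoint.get('state_dict', checkpoint)
44 | # Drop the 'module.' prefix that nn.DataParallel adds to parameter names.
45 | state_dict = {(k[len('module.'):] if k.startswith('module.') else k): v
46 |               for k, v in state_dict.items()}
47 | net.load_state_dict(state_dict)
48 | net.eval()
49 | ```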
--------------------------------------------------------------------------------
/scripts/test/sthv1/En.sh:
--------------------------------------------------------------------------------
1 | python test_models.py something \
2 | --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8,8 \
3 | --weights=pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar,pretrained/PAN_RGB_something_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar \
4 | --full_res --twice_sample
5 |
--------------------------------------------------------------------------------
/scripts/test/sthv1/Full.sh:
--------------------------------------------------------------------------------
1 | python test_models.py something --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8 --weights=pretrained/PAN_RGB_something_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar
2 |
--------------------------------------------------------------------------------
/scripts/test/sthv1/Lite.sh:
--------------------------------------------------------------------------------
1 | python test_models.py something --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8 --weights=pretrained/PAN_Lite_something_resnet50_shift8_blockres_avg_segment8_e80.pth.tar
2 |
--------------------------------------------------------------------------------
/scripts/test/sthv2/En.sh:
--------------------------------------------------------------------------------
1 | python test_models.py somethingv2 \
2 | --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8,8 \
3 | --weights=pretrained/PAN_Lite_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar,pretrained/PAN_RGB_somethingv2_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar \
4 | --full_res --twice_sample
5 |
--------------------------------------------------------------------------------
/scripts/test/sthv2/Full.sh:
--------------------------------------------------------------------------------
1 | python test_models.py somethingv2 --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8,8 --weights=pretrained/PAN_RGB_somethingv2_resnet50_shift8_blockres_avg_segment8_e50.pth.tar,pretrained/PAN_PA_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar
2 |
--------------------------------------------------------------------------------
/scripts/test/sthv2/Lite.sh:
--------------------------------------------------------------------------------
1 | python test_models.py somethingv2 --VAP --batch_size=64 -j=4 --test_crops=1 --test_segments=8 --weights=pretrained/PAN_Lite_somethingv2_resnet50_shift8_blockres_avg_segment8_e80.pth.tar
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv1/Full_PA.sh:
--------------------------------------------------------------------------------
1 | python main.py something PA --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv1/Full_RGB.sh:
--------------------------------------------------------------------------------
1 | python main.py something RGB --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 20 40 --epochs 50 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv1/Lite.sh:
--------------------------------------------------------------------------------
1 | python main.py something Lite --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv2/Full_PA.sh:
--------------------------------------------------------------------------------
1 | python main.py somethingv2 PA --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv2/Full_RGB.sh:
--------------------------------------------------------------------------------
1 | python main.py somethingv2 RGB --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 20 40 --epochs 50 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/scripts/train/sthv2/Lite.sh:
--------------------------------------------------------------------------------
1 | python main.py somethingv2 Lite --arch resnet50 --num_segments 8 --lr 0.01 --lr_steps 30 60 --epochs 80 --batch-size 64 -i 1 -j 8 --dropout 0.5
2 |
--------------------------------------------------------------------------------
/test_models.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import argparse
7 | import time
8 |
9 | import torch.nn.parallel
10 | import torch.optim
11 | from sklearn.metrics import confusion_matrix
12 | from ops.dataset import PANDataSet
13 | from ops.models import PAN
14 | from ops.transforms import *
15 | from ops import dataset_config
16 | from torch.nn import functional as F
17 |
18 | # options
19 | parser = argparse.ArgumentParser(description="PAN testing on the full validation set")
20 | parser.add_argument('dataset', type=str)
21 |
22 | # may contain splits
23 | parser.add_argument('--weights', type=str, default=None)
24 | parser.add_argument('--test_segments', type=str, default='25')  # kept as a string so it can be split on ','
25 | parser.add_argument('--dense_sample', default=False, action="store_true", help='use dense sample as I3D')
26 | parser.add_argument('--twice_sample', default=False, action="store_true", help='use twice sample for ensemble')
27 | parser.add_argument('--full_res', default=False, action="store_true",
28 | help='use full resolution 256x256 for test as in Non-local I3D')
29 |
30 | parser.add_argument('--test_crops', type=int, default=1)
31 | parser.add_argument('--coeff', type=str, default=None)
32 | parser.add_argument('--batch_size', type=int, default=1)
33 | parser.add_argument('-j', '--workers', default=8, type=int, metavar='N',
34 | help='number of data loading workers (default: 8)')
35 |
36 | # for true test
37 | parser.add_argument('--test_list', type=str, default=None)
38 | parser.add_argument('--csv_file', type=str, default=None)
39 |
40 | parser.add_argument('--softmax', default=False, action="store_true", help='use softmax')
41 |
42 | parser.add_argument('--max_num', type=int, default=-1)
43 | parser.add_argument('--input_size', type=int, default=224)
44 | parser.add_argument('--crop_fusion_type', type=str, default='avg')
45 | parser.add_argument('--gpus', nargs='+', type=int, default=None)
46 | parser.add_argument('--img_feature_dim', type=int, default=256)
47 | parser.add_argument('--num_set_segments', type=int, default=1, help='TODO: select multiple sets of n-frames from a video')
48 | parser.add_argument('--pretrain', type=str, default='imagenet')
49 | parser.add_argument('--lmdb', default=False, action="store_true", help='use lmdb format dataset')
50 | parser.add_argument('--VAP', default=False, action="store_true", help='use VAP for various-timescale aggregation')
51 | args = parser.parse_args()
52 |
53 |
54 | class AverageMeter(object):
55 | """Computes and stores the average and current value"""
56 | def __init__(self):
57 | self.reset()
58 |
59 | def reset(self):
60 | self.val = 0
61 | self.avg = 0
62 | self.sum = 0
63 | self.count = 0
64 |
65 | def update(self, val, n=1):
66 | self.val = val
67 | self.sum += val * n
68 | self.count += n
69 | self.avg = self.sum / self.count
70 |
71 |
72 | def accuracy(output, target, topk=(1,)):
73 | """Computes the precision@k for the specified values of k"""
74 | maxk = max(topk)
75 | batch_size = target.size(0)
76 | _, pred = output.topk(maxk, 1, True, True)
77 | pred = pred.t()
78 | correct = pred.eq(target.view(1, -1).expand_as(pred))
79 | res = []
80 | for k in topk:
81 |         correct_k = correct[:k].reshape(-1).float().sum(0)  # reshape: the transposed slice may be non-contiguous on newer PyTorch
82 | res.append(correct_k.mul_(100.0 / batch_size))
83 | return res
84 |
85 |
86 | def parse_shift_option_from_log_name(log_name):
87 | if 'shift' in log_name:
88 | strings = log_name.split('_')
89 | for i, s in enumerate(strings):
90 | if 'shift' in s:
91 | break
92 | return True, int(strings[i].replace('shift', '')), strings[i + 1]
93 | else:
94 | return False, None, None
95 |
96 |
97 | weights_list = args.weights.split(',')
98 | test_segments_list = [int(s) for s in args.test_segments.split(',')]
99 | assert len(weights_list) == len(test_segments_list)
100 | if args.coeff is None:
101 | coeff_list = [1] * len(weights_list)
102 | else:
103 | coeff_list = [float(c) for c in args.coeff.split(',')]
104 |
105 | if args.test_list is not None:
106 | test_file_list = args.test_list.split(',')
107 | else:
108 | test_file_list = [None] * len(weights_list)
109 |
110 |
111 | data_iter_list = []
112 | net_list = []
113 | modality_list = []
114 |
115 | total_num = None
116 | for this_weights, this_test_segments, test_file in zip(weights_list, test_segments_list, test_file_list):
117 | is_shift, shift_div, shift_place = parse_shift_option_from_log_name(this_weights)
118 | if 'Lite' in this_weights:
119 | modality = 'Lite'
120 | data_length = 4
121 | elif 'RGB' in this_weights:
122 | modality = 'RGB'
123 | data_length = 1
124 | elif 'PA' in this_weights:
125 | modality = 'PA'
126 | data_length = 4
127 | else:
128 | modality = 'Flow'
129 | data_length = 5
130 | this_arch = this_weights.split('PAN_')[1].split('_')[2]
131 | modality_list.append(modality)
132 | num_class, args.train_list, val_list, root_path, prefix = dataset_config.return_dataset(args.dataset,
133 | modality)
134 | print('=> shift: {}, shift_div: {}, shift_place: {}'.format(is_shift, shift_div, shift_place))
135 | net = PAN(num_class, this_test_segments if is_shift else 1, modality,
136 | base_model=this_arch,
137 | consensus_type=args.crop_fusion_type,
138 | img_feature_dim=args.img_feature_dim,
139 | pretrain=args.pretrain,
140 | is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
141 | non_local='_nl' in this_weights,
142 | data_length=data_length,
143 | has_VAP=args.VAP,
144 | )
145 |
146 | if 'tpool' in this_weights:
147 | from ops.temporal_shift import make_temporal_pool
148 | make_temporal_pool(net.base_model, this_test_segments) # since DataParallel
149 |
150 | checkpoint = torch.load(this_weights)
151 | checkpoint = checkpoint['state_dict']
152 |
153 | # base_dict = {('base_model.' + k).replace('base_model.fc', 'new_fc'): v for k, v in list(checkpoint.items())}
154 | base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
155 | replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
156 | 'base_model.classifier.bias': 'new_fc.bias',
157 | }
158 | for k, v in replace_dict.items():
159 | if k in base_dict:
160 | base_dict[v] = base_dict.pop(k)
161 |
162 | net.load_state_dict(base_dict)
163 |
164 | input_size = net.scale_size if args.full_res else net.input_size
165 | if args.test_crops == 1:
166 | cropping = torchvision.transforms.Compose([
167 | GroupScale(net.scale_size),
168 | GroupCenterCrop(input_size),
169 | ])
170 |     elif args.test_crops == 3:  # do not flip, so only 3 full-resolution crops
171 | cropping = torchvision.transforms.Compose([
172 | GroupFullResSample(input_size, net.scale_size, flip=False)
173 | ])
174 | elif args.test_crops == 5: # do not flip, so only 5 crops
175 | cropping = torchvision.transforms.Compose([
176 | GroupOverSample(input_size, net.scale_size, flip=False)
177 | ])
178 | elif args.test_crops == 10:
179 | cropping = torchvision.transforms.Compose([
180 | GroupOverSample(input_size, net.scale_size)
181 | ])
182 | else:
183 |         raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(args.test_crops))
184 |
185 | data_loader = torch.utils.data.DataLoader(
186 | PANDataSet(root_path, test_file if test_file is not None else val_list, num_segments=this_test_segments,
187 | new_length=data_length,
188 | modality=modality,
189 | image_tmpl=prefix,
190 | test_mode=True,
191 | remove_missing=len(weights_list) == 1,
192 | transform=torchvision.transforms.Compose([
193 | cropping,
194 | Stack(roll=(this_arch in ['BNInception', 'InceptionV3'])),
195 | ToTorchFormatTensor(div=(this_arch not in ['BNInception', 'InceptionV3'])),
196 | GroupNormalize(net.input_mean, net.input_std),
197 | ]), dense_sample=args.dense_sample, twice_sample=args.twice_sample, is_lmdb=args.lmdb),
198 | batch_size=args.batch_size, shuffle=False,
199 | num_workers=args.workers, pin_memory=True,
200 | )
201 |
202 | if args.gpus is not None:
203 | devices = [args.gpus[i] for i in range(args.workers)]
204 | else:
205 | devices = list(range(args.workers))
206 |
207 | net = torch.nn.DataParallel(net.cuda())
208 | net.eval()
209 |
210 | data_gen = enumerate(data_loader)
211 |
212 | if total_num is None:
213 | total_num = len(data_loader.dataset)
214 | else:
215 | assert total_num == len(data_loader.dataset)
216 |
217 | data_iter_list.append(data_gen)
218 | net_list.append(net)
219 |
220 |
221 | output = []
222 |
223 |
224 | def eval_video(video_data, net, this_test_segments, modality):
225 | net.eval()
226 | with torch.no_grad():
227 | i, data, label = video_data
228 | batch_size = label.numel()
229 | num_crop = args.test_crops
230 | if args.dense_sample:
231 | num_crop *= 10 # 10 clips for testing when using dense sample
232 |
233 | if args.twice_sample:
234 | num_crop *= 2
235 |
236 |         if modality == 'RGB':
237 |             length = 3   # 1 frame x 3 channels
238 |         elif modality in ['PA', 'Lite']:
239 |             length = 12  # 4 stacked frames x 3 channels (data_length = 4 above)
240 |         elif modality == 'Flow':
241 |             length = 10  # 5 flow frames x 2 channels (data_length = 5 above)
242 |         elif modality == 'RGBDiff':
243 |             length = 18  # 6 frames x 3 channels
244 |         else:
245 |             raise ValueError("Unknown modality " + modality)
246 |
247 | if modality in ['PA', 'Lite']:
248 | PA_length = 4
249 | else:
250 | PA_length = 1
251 |
252 | data_in = data.view(-1, length, data.size(2), data.size(3))
253 | if is_shift:
254 | data_in = data_in.view(batch_size * num_crop, this_test_segments, length, data_in.size(2), data_in.size(3))
255 | rst = net(data_in)
256 | rst = rst.reshape(batch_size, num_crop, -1).mean(1)
257 |
258 | if args.softmax:
259 | # take the softmax to normalize the output to probability
260 | rst = F.softmax(rst, dim=1)
261 |
262 | rst = rst.data.cpu().numpy().copy()
263 |
264 | if net.module.is_shift:
265 | rst = rst.reshape(batch_size, num_class)
266 | else:
267 | rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
268 |
269 | return i, rst, label
270 |
271 |
272 | proc_start_time = time.time()
273 | max_num = args.max_num if args.max_num > 0 else total_num
274 |
275 | top1 = AverageMeter()
276 | top5 = AverageMeter()
277 |
278 | for i, data_label_pairs in enumerate(zip(*data_iter_list)):
279 | with torch.no_grad():
280 | if i >= max_num:
281 | break
282 | this_rst_list = []
283 | this_label = None
284 | for n_seg, (_, (data, label)), net, modality in zip(test_segments_list, data_label_pairs, net_list, modality_list):
285 | rst = eval_video((i, data, label), net, n_seg, modality)
286 | this_rst_list.append(rst[1])
287 | this_label = label
288 | assert len(this_rst_list) == len(coeff_list)
289 | for i_coeff in range(len(this_rst_list)):
290 | this_rst_list[i_coeff] *= coeff_list[i_coeff]
291 | ensembled_predict = sum(this_rst_list) / len(this_rst_list)
292 |
293 | for p, g in zip(ensembled_predict, this_label.cpu().numpy()):
294 | output.append([p[None, ...], g])
295 | cnt_time = time.time() - proc_start_time
296 | prec1, prec5 = accuracy(torch.from_numpy(ensembled_predict), this_label, topk=(1, 5))
297 | top1.update(prec1.item(), this_label.numel())
298 | top5.update(prec5.item(), this_label.numel())
299 | if i % 20 == 0:
300 | print('video {} done, total {}/{}, average {:.3f} sec/video, '
301 | 'moving Prec@1 {:.3f} Prec@5 {:.3f}'.format(i * args.batch_size, i * args.batch_size, total_num,
302 | float(cnt_time) / (i+1) / args.batch_size, top1.avg, top5.avg))
303 |
304 | video_pred = [np.argmax(x[0]) for x in output]
305 | video_pred_top5 = [np.argsort(np.mean(x[0], axis=0).reshape(-1))[::-1][:5] for x in output]
306 |
307 | video_labels = [x[1] for x in output]
308 |
309 |
310 | if args.csv_file is not None:
311 | print('=> Writing result to csv file: {}'.format(args.csv_file))
312 | with open('sth_category.txt') as f:
313 | categories = f.readlines()
314 | categories = sorted([f.strip() for f in categories])
315 | with open(test_file_list[0]) as f:
316 | vid_names = f.readlines()
317 | vid_names = [n.split(' ')[0] for n in vid_names]
318 | assert len(vid_names) == len(video_pred)
319 | if args.dataset != 'somethingv2': # only output top1
320 | with open(args.csv_file, 'w') as f:
321 | for n, pred in zip(vid_names, video_pred):
322 | f.write('{};{}\n'.format(n, categories[pred]))
323 | else:
324 | with open(args.csv_file, 'w') as f:
325 | for n, pred5 in zip(vid_names, video_pred_top5):
326 | fill = [n]
327 | for p in list(pred5):
328 | fill.append(p)
329 | f.write('{};{};{};{};{};{}\n'.format(*fill))
330 |
331 |
332 | cf = confusion_matrix(video_labels, video_pred).astype(float)
333 |
334 | np.save('cm.npy', cf)
335 | cls_cnt = cf.sum(axis=1)
336 | cls_hit = np.diag(cf)
337 |
338 | cls_acc = cls_hit / cls_cnt
339 | print(cls_acc)
340 | upper = np.mean(np.max(cf, axis=1) / cls_cnt)
341 | print('upper bound: {}'.format(upper))
342 |
343 | print('-----Evaluation is finished------')
344 | print('Class Accuracy {:.02f}%'.format(np.mean(cls_acc) * 100))
345 | print('Overall Prec@1 {:.02f}% Prec@5 {:.02f}%'.format(top1.avg, top5.avg))
346 |
347 | # reorder before saving
348 | name_list = [x.strip().split()[0] for x in open(val_list)]
349 |
350 | order_dict = {e:i for i, e in enumerate(sorted(name_list))}
351 |
352 | reorder_output = [None] * len(output)
353 | reorder_label = [None] * len(output)
354 |
355 | for i in range(len(output)):
356 | idx = order_dict[name_list[i]]
357 | reorder_output[idx] = output[i]
358 | reorder_label[idx] = video_labels[i]
359 |
360 | if set(['PA', 'RGB']) == set(modality_list):
361 | modality = 'Full'
362 | elif set(['PA', 'RGB', 'Lite']) == set(modality_list):
363 | modality = 'En'
364 |
365 | np.savez("_".join([args.dataset, modality, str(top1.avg)]), scores=reorder_output, labels=reorder_label)
366 |
--------------------------------------------------------------------------------
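For later analysis, the score file written by the final np.savez call in test_models.py can be reloaded offline. A minimal sketch, assuming the save above succeeded and using a hypothetical file name (test_models.py names it "<dataset>_<modality>_<top1 accuracy>.npz"); allow_pickle=True is needed because the scores were stored as Python lists:

import numpy as np

# Hypothetical file name for illustration only.
data = np.load('something_Full_53.40.npz', allow_pickle=True)
scores, labels = data['scores'], data['labels']

# Each entry of `scores` is the [logits, label] pair appended to `output` above.
preds = [np.argmax(s[0]) for s in scores]
top1 = 100.0 * np.mean([int(p == l) for p, l in zip(preds, labels)])
print('Reloaded Prec@1: {:.2f}%'.format(top1))

--------------------------------------------------------------------------------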
/tools/gen_label_kinetics.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 | # ------------------------------------------------------
6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py
7 |
8 | import os
9 |
10 |
11 | dataset_path = '/ssd/video/kinetics/images256/'
12 | label_path = '/ssd/video/kinetics/labels'
13 |
14 | if __name__ == '__main__':
15 | with open('kinetics_label_map.txt') as f:
16 | categories = f.readlines()
17 | categories = [c.strip().replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '') for c in categories]
18 | assert len(set(categories)) == 400
19 | dict_categories = {}
20 | for i, category in enumerate(categories):
21 | dict_categories[category] = i
22 |
23 | print(dict_categories)
24 |
25 | files_input = ['kinetics_val.csv', 'kinetics_train.csv']
26 | files_output = ['val_videofolder.txt', 'train_videofolder.txt']
27 | for (filename_input, filename_output) in zip(files_input, files_output):
28 | count_cat = {k: 0 for k in dict_categories.keys()}
29 | with open(os.path.join(label_path, filename_input)) as f:
30 | lines = f.readlines()[1:]
31 | folders = []
32 | idx_categories = []
33 | categories_list = []
34 | for line in lines:
35 | line = line.rstrip()
36 | items = line.split(',')
37 | folders.append(items[1] + '_' + items[2])
38 |             this_category = items[0].replace(' ', '_').replace('"', '').replace('(', '').replace(')', '').replace("'", '')
39 |             categories_list.append(this_category)
40 |             idx_categories.append(dict_categories[this_category])
41 |             count_cat[this_category] += 1
42 | print(max(count_cat.values()))
43 |
44 | assert len(idx_categories) == len(folders)
45 | missing_folders = []
46 | output = []
47 | for i in range(len(folders)):
48 | curFolder = folders[i]
49 | curIDX = idx_categories[i]
50 |             # count the number of frames in each video folder
51 | img_dir = os.path.join(dataset_path, categories_list[i], curFolder)
52 | if not os.path.exists(img_dir):
53 | missing_folders.append(img_dir)
54 | # print(missing_folders)
55 | else:
56 | dir_files = os.listdir(img_dir)
57 | output.append('%s %d %d'%(os.path.join(categories_list[i], curFolder), len(dir_files), curIDX))
58 | print('%d/%d, missing %d'%(i, len(folders), len(missing_folders)))
59 | with open(os.path.join(label_path, filename_output),'w') as f:
60 | f.write('\n'.join(output))
61 | with open(os.path.join(label_path, 'missing_' + filename_output),'w') as f:
62 | f.write('\n'.join(missing_folders))
63 |
--------------------------------------------------------------------------------
/tools/gen_label_sthv1.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 | # ------------------------------------------------------
6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py
7 | # Processes the raw data of the Something-Something-V1 dataset
8 |
9 | import os
10 |
11 | if __name__ == '__main__':
12 | dataset_name = 'something-something-v1' # 'jester-v1'
13 | with open('%s-labels.csv' % dataset_name) as f:
14 | lines = f.readlines()
15 | categories = []
16 | for line in lines:
17 | line = line.rstrip()
18 | categories.append(line)
19 | categories = sorted(categories)
20 | with open('category.txt', 'w') as f:
21 | f.write('\n'.join(categories))
22 |
23 | dict_categories = {}
24 | for i, category in enumerate(categories):
25 | dict_categories[category] = i
26 |
27 | files_input = ['%s-validation.csv' % dataset_name, '%s-train.csv' % dataset_name]
28 | files_output = ['val_videofolder.txt', 'train_videofolder.txt']
29 | for (filename_input, filename_output) in zip(files_input, files_output):
30 | with open(filename_input) as f:
31 | lines = f.readlines()
32 | folders = []
33 | idx_categories = []
34 | for line in lines:
35 | line = line.rstrip()
36 | items = line.split(';')
37 | folders.append(items[0])
38 | idx_categories.append(dict_categories[items[1]])
39 | output = []
40 | for i in range(len(folders)):
41 | curFolder = folders[i]
42 | curIDX = idx_categories[i]
43 |             # count the number of frames in each video folder
44 | dir_files = os.listdir(os.path.join('../img', curFolder))
45 | output.append('%s %d %d' % ('something/v1/img/' + curFolder, len(dir_files), curIDX))
46 | print('%d/%d' % (i, len(folders)))
47 | with open(filename_output, 'w') as f:
48 | f.write('\n'.join(output))
49 |
--------------------------------------------------------------------------------
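Both label-generation scripts above write their list files in the same plain-text format: one video per line as "<frame folder> <number of frames> <class index>". A minimal parsing sketch, using one of the output file names from files_output above:

# Sketch only: read the generated list file line by line.
with open('train_videofolder.txt') as f:
    for line in f:
        folder, num_frames, label = line.strip().rsplit(' ', 2)
        # e.g. folder='something/v1/img/12345', num_frames='48', label='27'

--------------------------------------------------------------------------------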
/tools/gen_label_sthv2.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 | # ------------------------------------------------------
6 | # Code adapted from https://github.com/metalbubble/TRN-pytorch/blob/master/process_dataset.py
7 | # Processes the raw data of the Something-Something-V2 dataset
8 |
9 | import os
10 | import json
11 |
12 | if __name__ == '__main__':
13 | dataset_name = 'something-something-v2' # 'jester-v1'
14 | with open('%s-labels.json' % dataset_name) as f:
15 | data = json.load(f)
16 | categories = []
17 | for i, (cat, idx) in enumerate(data.items()):
18 | assert i == int(idx) # make sure the rank is right
19 | categories.append(cat)
20 |
21 | with open('category.txt', 'w') as f:
22 | f.write('\n'.join(categories))
23 |
24 | dict_categories = {}
25 | for i, category in enumerate(categories):
26 | dict_categories[category] = i
27 |
28 | files_input = ['%s-validation.json' % dataset_name, '%s-train.json' % dataset_name, '%s-test.json' % dataset_name]
29 | files_output = ['val_videofolder.txt', 'train_videofolder.txt', 'test_videofolder.txt']
30 | for (filename_input, filename_output) in zip(files_input, files_output):
31 | with open(filename_input) as f:
32 | data = json.load(f)
33 | folders = []
34 | idx_categories = []
35 | for item in data:
36 | folders.append(item['id'])
37 | if 'test' not in filename_input:
38 | idx_categories.append(dict_categories[item['template'].replace('[', '').replace(']', '')])
39 | else:
40 | idx_categories.append(0)
41 | output = []
42 | for i in range(len(folders)):
43 | curFolder = folders[i]
44 | curIDX = idx_categories[i]
45 |             # count the number of frames in each video folder
46 | dir_files = os.listdir(os.path.join('20bn-something-something-v2-frames', curFolder))
47 | output.append('%s %d %d' % (curFolder, len(dir_files), curIDX))
48 | print('%d/%d' % (i, len(folders)))
49 | with open(filename_output, 'w') as f:
50 | f.write('\n'.join(output))
51 |
--------------------------------------------------------------------------------
/tools/kinetics_label_map.txt:
--------------------------------------------------------------------------------
1 | abseiling
2 | air drumming
3 | answering questions
4 | applauding
5 | applying cream
6 | archery
7 | arm wrestling
8 | arranging flowers
9 | assembling computer
10 | auctioning
11 | baby waking up
12 | baking cookies
13 | balloon blowing
14 | bandaging
15 | barbequing
16 | bartending
17 | beatboxing
18 | bee keeping
19 | belly dancing
20 | bench pressing
21 | bending back
22 | bending metal
23 | biking through snow
24 | blasting sand
25 | blowing glass
26 | blowing leaves
27 | blowing nose
28 | blowing out candles
29 | bobsledding
30 | bookbinding
31 | bouncing on trampoline
32 | bowling
33 | braiding hair
34 | breading or breadcrumbing
35 | breakdancing
36 | brush painting
37 | brushing hair
38 | brushing teeth
39 | building cabinet
40 | building shed
41 | bungee jumping
42 | busking
43 | canoeing or kayaking
44 | capoeira
45 | carrying baby
46 | cartwheeling
47 | carving pumpkin
48 | catching fish
49 | catching or throwing baseball
50 | catching or throwing frisbee
51 | catching or throwing softball
52 | celebrating
53 | changing oil
54 | changing wheel
55 | checking tires
56 | cheerleading
57 | chopping wood
58 | clapping
59 | clay pottery making
60 | clean and jerk
61 | cleaning floor
62 | cleaning gutters
63 | cleaning pool
64 | cleaning shoes
65 | cleaning toilet
66 | cleaning windows
67 | climbing a rope
68 | climbing ladder
69 | climbing tree
70 | contact juggling
71 | cooking chicken
72 | cooking egg
73 | cooking on campfire
74 | cooking sausages
75 | counting money
76 | country line dancing
77 | cracking neck
78 | crawling baby
79 | crossing river
80 | crying
81 | curling hair
82 | cutting nails
83 | cutting pineapple
84 | cutting watermelon
85 | dancing ballet
86 | dancing charleston
87 | dancing gangnam style
88 | dancing macarena
89 | deadlifting
90 | decorating the christmas tree
91 | digging
92 | dining
93 | disc golfing
94 | diving cliff
95 | dodgeball
96 | doing aerobics
97 | doing laundry
98 | doing nails
99 | drawing
100 | dribbling basketball
101 | drinking
102 | drinking beer
103 | drinking shots
104 | driving car
105 | driving tractor
106 | drop kicking
107 | drumming fingers
108 | dunking basketball
109 | dying hair
110 | eating burger
111 | eating cake
112 | eating carrots
113 | eating chips
114 | eating doughnuts
115 | eating hotdog
116 | eating ice cream
117 | eating spaghetti
118 | eating watermelon
119 | egg hunting
120 | exercising arm
121 | exercising with an exercise ball
122 | extinguishing fire
123 | faceplanting
124 | feeding birds
125 | feeding fish
126 | feeding goats
127 | filling eyebrows
128 | finger snapping
129 | fixing hair
130 | flipping pancake
131 | flying kite
132 | folding clothes
133 | folding napkins
134 | folding paper
135 | front raises
136 | frying vegetables
137 | garbage collecting
138 | gargling
139 | getting a haircut
140 | getting a tattoo
141 | giving or receiving award
142 | golf chipping
143 | golf driving
144 | golf putting
145 | grinding meat
146 | grooming dog
147 | grooming horse
148 | gymnastics tumbling
149 | hammer throw
150 | headbanging
151 | headbutting
152 | high jump
153 | high kick
154 | hitting baseball
155 | hockey stop
156 | holding snake
157 | hopscotch
158 | hoverboarding
159 | hugging
160 | hula hooping
161 | hurdling
162 | hurling (sport)
163 | ice climbing
164 | ice fishing
165 | ice skating
166 | ironing
167 | javelin throw
168 | jetskiing
169 | jogging
170 | juggling balls
171 | juggling fire
172 | juggling soccer ball
173 | jumping into pool
174 | jumpstyle dancing
175 | kicking field goal
176 | kicking soccer ball
177 | kissing
178 | kitesurfing
179 | knitting
180 | krumping
181 | laughing
182 | laying bricks
183 | long jump
184 | lunge
185 | making a cake
186 | making a sandwich
187 | making bed
188 | making jewelry
189 | making pizza
190 | making snowman
191 | making sushi
192 | making tea
193 | marching
194 | massaging back
195 | massaging feet
196 | massaging legs
197 | massaging person's head
198 | milking cow
199 | mopping floor
200 | motorcycling
201 | moving furniture
202 | mowing lawn
203 | news anchoring
204 | opening bottle
205 | opening present
206 | paragliding
207 | parasailing
208 | parkour
209 | passing American football (in game)
210 | passing American football (not in game)
211 | peeling apples
212 | peeling potatoes
213 | petting animal (not cat)
214 | petting cat
215 | picking fruit
216 | planting trees
217 | plastering
218 | playing accordion
219 | playing badminton
220 | playing bagpipes
221 | playing basketball
222 | playing bass guitar
223 | playing cards
224 | playing cello
225 | playing chess
226 | playing clarinet
227 | playing controller
228 | playing cricket
229 | playing cymbals
230 | playing didgeridoo
231 | playing drums
232 | playing flute
233 | playing guitar
234 | playing harmonica
235 | playing harp
236 | playing ice hockey
237 | playing keyboard
238 | playing kickball
239 | playing monopoly
240 | playing organ
241 | playing paintball
242 | playing piano
243 | playing poker
244 | playing recorder
245 | playing saxophone
246 | playing squash or racquetball
247 | playing tennis
248 | playing trombone
249 | playing trumpet
250 | playing ukulele
251 | playing violin
252 | playing volleyball
253 | playing xylophone
254 | pole vault
255 | presenting weather forecast
256 | pull ups
257 | pumping fist
258 | pumping gas
259 | punching bag
260 | punching person (boxing)
261 | push up
262 | pushing car
263 | pushing cart
264 | pushing wheelchair
265 | reading book
266 | reading newspaper
267 | recording music
268 | riding a bike
269 | riding camel
270 | riding elephant
271 | riding mechanical bull
272 | riding mountain bike
273 | riding mule
274 | riding or walking with horse
275 | riding scooter
276 | riding unicycle
277 | ripping paper
278 | robot dancing
279 | rock climbing
280 | rock scissors paper
281 | roller skating
282 | running on treadmill
283 | sailing
284 | salsa dancing
285 | sanding floor
286 | scrambling eggs
287 | scuba diving
288 | setting table
289 | shaking hands
290 | shaking head
291 | sharpening knives
292 | sharpening pencil
293 | shaving head
294 | shaving legs
295 | shearing sheep
296 | shining shoes
297 | shooting basketball
298 | shooting goal (soccer)
299 | shot put
300 | shoveling snow
301 | shredding paper
302 | shuffling cards
303 | side kick
304 | sign language interpreting
305 | singing
306 | situp
307 | skateboarding
308 | ski jumping
309 | skiing (not slalom or crosscountry)
310 | skiing crosscountry
311 | skiing slalom
312 | skipping rope
313 | skydiving
314 | slacklining
315 | slapping
316 | sled dog racing
317 | smoking
318 | smoking hookah
319 | snatch weight lifting
320 | sneezing
321 | sniffing
322 | snorkeling
323 | snowboarding
324 | snowkiting
325 | snowmobiling
326 | somersaulting
327 | spinning poi
328 | spray painting
329 | spraying
330 | springboard diving
331 | squat
332 | sticking tongue out
333 | stomping grapes
334 | stretching arm
335 | stretching leg
336 | strumming guitar
337 | surfing crowd
338 | surfing water
339 | sweeping floor
340 | swimming backstroke
341 | swimming breast stroke
342 | swimming butterfly stroke
343 | swing dancing
344 | swinging legs
345 | swinging on something
346 | sword fighting
347 | tai chi
348 | taking a shower
349 | tango dancing
350 | tap dancing
351 | tapping guitar
352 | tapping pen
353 | tasting beer
354 | tasting food
355 | testifying
356 | texting
357 | throwing axe
358 | throwing ball
359 | throwing discus
360 | tickling
361 | tobogganing
362 | tossing coin
363 | tossing salad
364 | training dog
365 | trapezing
366 | trimming or shaving beard
367 | trimming trees
368 | triple jump
369 | tying bow tie
370 | tying knot (not on a tie)
371 | tying tie
372 | unboxing
373 | unloading truck
374 | using computer
375 | using remote controller (not gaming)
376 | using segway
377 | vault
378 | waiting in line
379 | walking the dog
380 | washing dishes
381 | washing feet
382 | washing hair
383 | washing hands
384 | water skiing
385 | water sliding
386 | watering plants
387 | waxing back
388 | waxing chest
389 | waxing eyebrows
390 | waxing legs
391 | weaving basket
392 | welding
393 | whistling
394 | windsurfing
395 | wrapping present
396 | wrestling
397 | writing
398 | yawning
399 | yoga
400 | zumba
--------------------------------------------------------------------------------
/tools/vid2img_kinetics.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | from __future__ import print_function, division
7 | import os
8 | import sys
9 | import subprocess
10 | from multiprocessing import Pool
11 | from tqdm import tqdm
12 |
13 | n_thread = 100
14 |
15 |
16 | def vid2jpg(file_name, class_path, dst_class_path):
17 | if '.mp4' not in file_name:
18 | return
19 | name, ext = os.path.splitext(file_name)
20 | dst_directory_path = os.path.join(dst_class_path, name)
21 |
22 | video_file_path = os.path.join(class_path, file_name)
23 | try:
24 | if os.path.exists(dst_directory_path):
25 | if not os.path.exists(os.path.join(dst_directory_path, 'img_00001.jpg')):
26 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
27 | print('remove {}'.format(dst_directory_path))
28 | os.mkdir(dst_directory_path)
29 | else:
30 | print('*** convert has been done: {}'.format(dst_directory_path))
31 | return
32 | else:
33 | os.mkdir(dst_directory_path)
34 | except:
35 | print(dst_directory_path)
36 | return
37 | cmd = 'ffmpeg -i \"{}\" -threads 1 -vf scale=-1:331 -q:v 0 \"{}/img_%05d.jpg\"'.format(video_file_path, dst_directory_path)
38 | # print(cmd)
39 | subprocess.call(cmd, shell=True,
40 | stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
41 |
42 |
43 | def class_process(dir_path, dst_dir_path, class_name):
44 | print('*' * 20, class_name, '*'*20)
45 | class_path = os.path.join(dir_path, class_name)
46 | if not os.path.isdir(class_path):
47 | print('*** is not a dir {}'.format(class_path))
48 | return
49 |
50 | dst_class_path = os.path.join(dst_dir_path, class_name)
51 | if not os.path.exists(dst_class_path):
52 | os.mkdir(dst_class_path)
53 |
54 | vid_list = os.listdir(class_path)
55 | vid_list.sort()
56 | p = Pool(n_thread)
57 | from functools import partial
58 | worker = partial(vid2jpg, class_path=class_path, dst_class_path=dst_class_path)
59 | for _ in tqdm(p.imap_unordered(worker, vid_list), total=len(vid_list)):
60 | pass
61 | # p.map(worker, vid_list)
62 | p.close()
63 | p.join()
64 |
65 | print('\n')
66 |
67 |
68 | if __name__ == "__main__":
69 | dir_path = sys.argv[1]
70 | dst_dir_path = sys.argv[2]
71 |
72 | class_list = os.listdir(dir_path)
73 | class_list.sort()
74 | for class_name in class_list:
75 | class_process(dir_path, dst_dir_path, class_name)
76 |
77 | class_name = 'test'
78 | class_process(dir_path, dst_dir_path, class_name)
79 |
--------------------------------------------------------------------------------
/tools/vid2img_sthv2.py:
--------------------------------------------------------------------------------
1 | # Code for paper:
2 | # [Title] - "PAN: Towards Fast Action Recognition via Learning Persistence of Appearance"
3 | # [Author] - Can Zhang, Yuexian Zou, Guang Chen, Lei Gan
4 | # [Github] - https://github.com/zhang-can/PAN-PyTorch
5 |
6 | import os
7 | import threading
8 |
9 | NUM_THREADS = 100
10 | VIDEO_ROOT = '/ssd/video/something/v2/20bn-something-something-v2' # Downloaded webm videos
11 | FRAME_ROOT = '/ssd/video/something/v2/20bn-something-something-v2-frames' # Directory for extracted frames
12 |
13 |
14 | def split(l, n):
15 | """Yield successive n-sized chunks from l."""
16 | for i in range(0, len(l), n):
17 | yield l[i:i + n]
18 |
19 |
20 | def extract(video, tmpl='%06d.jpg'):
21 | # os.system(f'ffmpeg -i {VIDEO_ROOT}/{video} -vf -threads 1 -vf scale=-1:256 -q:v 0 '
22 | # f'{FRAME_ROOT}/{video[:-5]}/{tmpl}')
23 | cmd = 'ffmpeg -i \"{}/{}\" -threads 1 -vf scale=-1:256 -q:v 0 \"{}/{}/%06d.jpg\"'.format(VIDEO_ROOT, video,
24 | FRAME_ROOT, video[:-5])
25 | os.system(cmd)
26 |
27 |
28 | def target(video_list):
29 | for video in video_list:
30 | os.makedirs(os.path.join(FRAME_ROOT, video[:-5]))
31 | extract(video)
32 |
33 |
34 | if __name__ == '__main__':
35 | if not os.path.exists(VIDEO_ROOT):
36 | raise ValueError('Please download videos and set VIDEO_ROOT variable.')
37 | if not os.path.exists(FRAME_ROOT):
38 | os.makedirs(FRAME_ROOT)
39 |
40 | video_list = os.listdir(VIDEO_ROOT)
41 | splits = list(split(video_list, NUM_THREADS))
42 |
43 | threads = []
44 |     for chunk in splits:  # avoid shadowing the split() helper defined above
45 |         thread = threading.Thread(target=target, args=(chunk,))
46 | thread.start()
47 | threads.append(thread)
48 |
49 | for thread in threads:
50 | thread.join()
51 |
--------------------------------------------------------------------------------
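Before running the label-generation scripts, it can help to confirm that every frame folder produced by the extraction above is non-empty (e.g. ffmpeg may have failed on a corrupt video). A minimal sketch, reusing the FRAME_ROOT path from vid2img_sthv2.py:

import os

# Same path as in vid2img_sthv2.py.
FRAME_ROOT = '/ssd/video/something/v2/20bn-something-something-v2-frames'

# List frame folders that ended up with no extracted images.
empty = [d for d in sorted(os.listdir(FRAME_ROOT))
         if not os.listdir(os.path.join(FRAME_ROOT, d))]
print('{} empty frame folders'.format(len(empty)))

--------------------------------------------------------------------------------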