├── LICENSE ├── MAINTAINERS.md ├── README.md ├── core ├── models │ ├── __init__.py │ ├── _model_urls.py │ ├── blvnet_tam.py │ └── blvnet_tam_backbone.py ├── video_dataset.py ├── video_transforms.py └── video_utils.py ├── opts.py ├── requirement.txt ├── test.py ├── tools ├── extract_videos_st2st_v1.py └── extract_videos_st2st_v2.py └── train.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | MAINTAINERS 2 | 3 | Chun-Fu (Richard) Chen - chenrich@us.ibm.com 4 | 5 | Quanfu Fan - qfan@us.ibm.com -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bLVNet-TAM 2 | 3 | This repository holds the code and models for our paper, 4 | 5 | Quanfu Fan*, Chun-Fu (Richard) Chen*, Hilde Kuehne, Marco Pistoia, David Cox, "More Is Less: Learning Efficient Video Representations by Temporal Aggregation Modules" 6 | 7 | If you use the code and models from this repo, please cite our work. Thanks! 8 | ``` 9 | @incollection{ 10 | fan2019blvnet, 11 | title={{More Is Less: Learning Efficient Video Representations by Temporal Aggregation Modules}}, 12 | author={Quanfu Fan and Chun-Fu (Richard) Chen and Hilde Kuehne and Marco Pistoia and David Cox}, 13 | booktitle={Advances in Neural Information Processing Systems 32}, 14 | year={2019} 15 | } 16 | ``` 17 | 18 | ## Requirements 19 | 20 | ``` 21 | pip install -r requirement.txt 22 | ``` 23 | 24 | 25 | ## Pretrained Models on Something-Something 26 | The results below (top-1 accuracy) are reported under the single-crop and single-clip setting. 27 | 28 | ### V1 29 | 30 | | Name | Top-1 Val Acc. | 31 | |------|----------------| 32 | |[bLVNet-TAM-50-a2-b4-f8x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-50-f8x2) | 46.4 | 33 | |[bLVNet-TAM-50-a2-b4-f16x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-50-f16x2) | 48.4 | 34 | |[bLVNet-TAM-101-a2-b4-f8x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-101-f8x2) | 47.8 | 35 | |[bLVNet-TAM-101-a2-b4-f16x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-101-f16x2) | 49.6 | 36 | |[bLVNet-TAM-101-a2-b4-f24x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-101-f24x2) | 52.2 | 37 | |[bLVNet-TAM-101-a2-b4-f32x2](https://ibm.box.com/v/st2stv1-bLVNet-TAM-101-f32x2) | 53.1 | 38 | 39 | ### V2 40 | 41 | | Name | Top-1 Val Acc. | 42 | |------|------------| 43 | |[bLVNet-TAM-50-a2-b4-f8x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-50-f8x2) | 59.1 | 44 | |[bLVNet-TAM-50-a2-b4-f16x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-50-f16x2) | 61.7 | 45 | |[bLVNet-TAM-101-a2-b4-f8x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-101-f8x2) | 60.2 | 46 | |[bLVNet-TAM-101-a2-b4-f16x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-101-f16x2) | 61.9 | 47 | |[bLVNet-TAM-101-a2-b4-f24x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-101-f24x2) | 64.0 | 48 | |[bLVNet-TAM-101-a2-b4-f32x2](https://ibm.box.com/v/st2stv2-bLVNet-TAM-101-f32x2) | 65.2 | 49 | 50 | ## Data Preparation 51 | We provide two scripts in the folder `tools` to prepare input data for model training. The scripts sample an image sequence from a video and then resize each image so that its shorter side is `256` while keeping the aspect ratio of the image. 52 | You may need to set `folder_root` accordingly to ensure the extraction works correctly. 53 | 54 | ## Training 55 | To reproduce the results in our paper, the ImageNet-pretrained bLNet models are required; they are available [here](https://github.com/IBM/BigLittleNet).
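The bLNet checkpoints are resolved through relative paths (see `model_urls` in `core/models/blvnet_tam_backbone.py`), so, assuming the default file names, the expected layout next to the training scripts is:
```
pretrained/
├── ImageNet-bLResNet-50-a2-b4.pth.tar
└── ImageNet-bLResNet-101-a2-b4.pth.tar
```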
56 | 57 | With the pretrained models placed in the folder `pretrained`, the following script can be used to train 58 | a bLVNet-101-TAM-a2-b4-f8x2 model on Something-Something V2: 59 | 60 | ``` 61 | python3 train.py --datadir /path/to/folder \ 62 | --dataset st2stv2 -d 101 --groups 16 \ 63 | --logdir /path/to/logdir --lr 0.01 -b 64 --dropout 0.5 -j 36 \ 64 | --blending_frames 3 --epochs 50 --disable_scaleup --imagenet_blnet_pretrained 65 | ``` 66 | 67 | ## Test 68 | 69 | First download the models and put them in the `pretrained` folder. Then follow the example below to evaluate a model. 70 | Example: evaluating the bLVNet-101-TAM-a2-b4-f8x2 model on Something-Something V2: 71 | ``` 72 | python3 test.py --datadir /path/to/folder --dataset st2stv2 -d 101 --groups 16 \ 73 | --alpha 2 --beta 4 --evaluate --pretrained --disable_scaleup \ 74 | --logdir /path/to/logdir 75 | ``` 76 | 77 | You can add the `num_crops` and `num_clips` arguments to perform multi-crop and multi-clip evaluation for video-level accuracy. 78 | 79 | Please feel free to let us know if you encounter any issues when using our code and models. 80 | 81 | -------------------------------------------------------------------------------- /core/models/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .blvnet_tam import bLVNet_TAM 3 | 4 | __all__ = ['bLVNet_TAM'] -------------------------------------------------------------------------------- /core/models/_model_urls.py: -------------------------------------------------------------------------------- 1 | 2 | model_urls = { 3 | # something-something v1 4 | 'st2stv1-bLVNet-TAM-50-a2-b4-f8x2': 'pretrained/st2stv1-bLVNet-TAM-50-a2-b4-f8x2.pth.tar', 5 | 'st2stv1-bLVNet-TAM-50-a2-b4-f16x2': 'pretrained/st2stv1-bLVNet-TAM-50-a2-b4-f16x2.pth.tar', 6 | 'st2stv1-bLVNet-TAM-101-a2-b4-f8x2': 'pretrained/st2stv1-bLVNet-TAM-101-a2-b4-f8x2.pth.tar', 7 | 'st2stv1-bLVNet-TAM-101-a2-b4-f16x2': 'pretrained/st2stv1-bLVNet-TAM-101-a2-b4-f16x2.pth.tar', 8 | 'st2stv1-bLVNet-TAM-101-a2-b4-f24x2': 'pretrained/st2stv1-bLVNet-TAM-101-a2-b4-f24x2.pth.tar', 9 | 'st2stv1-bLVNet-TAM-101-a2-b4-f32x2': 'pretrained/st2stv1-bLVNet-TAM-101-a2-b4-f32x2.pth.tar', 10 | # something-something v2 11 | 'st2stv2-bLVNet-TAM-50-a2-b4-f8x2': 'pretrained/st2stv2-bLVNet-TAM-50-a2-b4-f8x2.pth.tar', 12 | 'st2stv2-bLVNet-TAM-50-a2-b4-f16x2': 'pretrained/st2stv2-bLVNet-TAM-50-a2-b4-f16x2.pth.tar', 13 | 'st2stv2-bLVNet-TAM-101-a2-b4-f8x2': 'pretrained/st2stv2-bLVNet-TAM-101-a2-b4-f8x2.pth.tar', 14 | 'st2stv2-bLVNet-TAM-101-a2-b4-f16x2': 'pretrained/st2stv2-bLVNet-TAM-101-a2-b4-f16x2.pth.tar', 15 | 'st2stv2-bLVNet-TAM-101-a2-b4-f24x2': 'pretrained/st2stv2-bLVNet-TAM-101-a2-b4-f24x2.pth.tar', 16 | 'st2stv2-bLVNet-TAM-101-a2-b4-f32x2': 'pretrained/st2stv2-bLVNet-TAM-101-a2-b4-f32x2.pth.tar', 17 | # kinetics 400 18 | 'kinetics400-bLVNet-TAM-50-a2-b4-f8x2': 'pretrained/kinetics400-bLVNet-TAM-50-a2-b4-f8x2.pth.tar', 19 | 'kinetics400-bLVNet-TAM-50-a2-b4-f16x2': 'pretrained/kinetics400-bLVNet-TAM-50-a2-b4-f16x2.pth.tar', 20 | } -------------------------------------------------------------------------------- /core/models/blvnet_tam.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | from .blvnet_tam_backbone import blvnet_tam_backbone 5 | from ._model_urls import model_urls 6 | 7 | 8 | class DotDict(dict): 9 | """dot.notation access to dictionary attributes""" 10 | __getattr__ = dict.get 11 |
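# note: using dict.get as __getattr__ means a missing key (e.g., an option not set in params) resolves to None instead of raising AttributeError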
__setattr__ = dict.__setitem__ 12 | __delattr__ = dict.__delitem__ 13 | 14 | 15 | class bLVNet_TAM(nn.Module): 16 | 17 | def __init__(self, params): 18 | super().__init__() 19 | params = DotDict(params) 20 | print(params) 21 | self.baseline_model = blvnet_tam_backbone(params.depth, params.alpha, params.beta, 22 | num_frames=params.groups, 23 | blending_frames=params.blending_frames, 24 | input_channels=params.input_channels, 25 | imagenet_blnet_pretrained=params.imagenet_blnet_pretrained) 26 | self.num_frames = params.groups 27 | self.blending_frames = params.blending_frames 28 | self.blending_method = params.blending_method 29 | self.partial_freeze_bn = params.partial_freeze_bn 30 | self.dropout = params.dropout 31 | self.modality = 'rgb' 32 | 33 | # get the dim of feature vec 34 | feature_dim = getattr(self.baseline_model, 'fc').in_features 35 | # update the fc layer and initialize it 36 | self.prepare_baseline(feature_dim, params.num_classes) 37 | 38 | self.model_name = '{dataset}-bLVNet-TAM-{depth}-a{alpha}-b{beta}-f{num_frames}x2'.format( 39 | dataset=params.dataset, depth=params.depth, alpha=params.alpha, beta=params.beta, 40 | num_frames=params.groups // 2) 41 | 42 | if params.pretrained: 43 | checkpoint = torch.load(model_urls[self.model_name], map_location='cpu') 44 | self.load_state_dict(checkpoint) 45 | 46 | def prepare_baseline(self, feature_dim, num_classes): 47 | if self.dropout > 0.0: 48 | # replace the original fc layer as dropout layer 49 | setattr(self.baseline_model, 'fc', nn.Dropout(p=self.dropout)) 50 | self.new_fc = nn.Linear(feature_dim, num_classes) 51 | nn.init.normal_(self.new_fc.weight, 0, 0.001) 52 | nn.init.constant_(self.new_fc.bias, 0) 53 | else: 54 | setattr(self.baseline_model, 'fc', nn.Linear(feature_dim, num_classes)) 55 | nn.init.normal_(getattr(self.baseline_model, 'fc').weight, 0, 0.001) 56 | nn.init.constant_(getattr(self.baseline_model, 'fc').bias, 0) 57 | 58 | def forward(self, x): 59 | n, c_t, h, w = x.shape 60 | batched_input = x.view(n * self.num_frames, c_t // self.num_frames, h, w) 61 | base_out = self.baseline_model(batched_input) 62 | if self.dropout > 0.0: 63 | base_out = self.new_fc(base_out) 64 | n_t, c = base_out.shape 65 | curr_num_frames = n_t // n 66 | base_out = base_out.view(n, curr_num_frames, c) 67 | # dim of base_out: [N, 1, num_classes] 68 | # average all frames 69 | out = torch.mean(base_out, dim=1) 70 | # dim of out: [N, num_classes] 71 | return out 72 | -------------------------------------------------------------------------------- /core/models/blvnet_tam_backbone.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | from collections import OrderedDict 4 | 5 | import torch.nn as nn 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | __all__ = ['bLVNet_TAM_BACKBONE', 'blvnet_tam_backbone'] 10 | 11 | model_urls = { 12 | 'blresnet50': 'pretrained/ImageNet-bLResNet-50-a2-b4.pth.tar', 13 | 'blresnet101': 'pretrained/ImageNet-bLResNet-101-a2-b4.pth.tar' 14 | } 15 | 16 | 17 | class TAM(nn.Module): 18 | 19 | def __init__(self, duration, channels, blending_frames=3): 20 | super().__init__() 21 | self.blending_frames = blending_frames 22 | 23 | if blending_frames == 3: 24 | self.prev = nn.Conv2d(channels, channels, kernel_size=1, 25 | padding=0, groups=channels, bias=False) 26 | self.next = nn.Conv2d(channels, channels, kernel_size=1, 27 | padding=0, groups=channels, bias=False) 28 | self.curr = nn.Conv2d(channels, channels, kernel_size=1, 29 | padding=0, 
groups=channels, bias=False) 30 | else: 31 | self.blending_layers = nn.ModuleList([nn.Conv2d(channels, channels, kernel_size=1, 32 | padding=0, groups=channels, bias=False) 33 | for i in range(blending_frames)]) 34 | self.relu = nn.ReLU(inplace=True) 35 | self.duration = duration 36 | 37 | def forward(self, x): 38 | if self.blending_frames == 3: 39 | 40 | prev_x = self.prev(x) 41 | curr_x = self.curr(x) 42 | next_x = self.next(x) 43 | prev_x = prev_x.view((-1, self.duration) + prev_x.size()[1:]) 44 | curr_x = curr_x.view((-1, self.duration) + curr_x.size()[1:]) 45 | next_x = next_x.view((-1, self.duration) + next_x.size()[1:]) 46 | 47 | prev_x = F.pad(prev_x, (0, 0, 0, 0, 0, 0, 1, 0))[:, :-1, ...] 48 | next_x = F.pad(next_x, (0, 0, 0, 0, 0, 0, 0, 1))[:, 1:, ...] 49 | 50 | out = torch.stack([prev_x, curr_x, next_x], dim=0) 51 | else: 52 | # multiple blending 53 | xs = [se(x) for se in self.blending_layers] 54 | xs = [x.view((-1, self.duration) + x.size()[1:]) for x in xs] 55 | 56 | shifted_xs = [] 57 | for i in range(self.blending_frames): 58 | shift = i - (self.blending_frames // 2) 59 | x_temp = xs[i] 60 | n, t, c, h, w = x_temp.shape 61 | start_index = 0 if shift < 0 else shift 62 | end_index = t if shift < 0 else t + shift 63 | padding = None 64 | if shift < 0: 65 | padding = (0, 0, 0, 0, 0, 0, abs(shift), 0) 66 | elif shift > 0: 67 | padding = (0, 0, 0, 0, 0, 0, 0, shift) 68 | shifted_xs.append(F.pad(x_temp, padding)[:, start_index:end_index, ...] 69 | if padding is not None else x_temp) 70 | 71 | out = torch.stack(shifted_xs, dim=0) 72 | out = torch.sum(out, dim=0) 73 | out = self.relu(out) 74 | # [N, T, C, N, H] 75 | out = out.view((-1, ) + out.size()[2:]) 76 | return out 77 | 78 | 79 | def get_frame_list(init_list, num_frames, batch_size): 80 | if batch_size == 0: 81 | return [] 82 | 83 | flist = list() 84 | for i in range(batch_size): 85 | flist.append([k + i * num_frames for k in init_list]) 86 | return list(itertools.chain(*flist)) 87 | 88 | 89 | class Bottleneck(nn.Module): 90 | expansion = 4 91 | 92 | def __init__(self, inplanes, planes, stride=1, downsample=None, last_relu=True, 93 | with_tam=False, num_frames=-1, blending_frames=-1): 94 | 95 | super().__init__() 96 | self.conv1 = nn.Conv2d(inplanes, planes // self.expansion, kernel_size=1, bias=False) 97 | self.bn1 = nn.BatchNorm2d(planes // self.expansion) 98 | self.conv2 = nn.Conv2d(planes // self.expansion, planes // self.expansion, kernel_size=3, 99 | stride=stride, padding=1, bias=False) 100 | self.bn2 = nn.BatchNorm2d(planes // self.expansion) 101 | self.conv3 = nn.Conv2d(planes // self.expansion, planes, kernel_size=1, bias=False) 102 | self.bn3 = nn.BatchNorm2d(planes) 103 | self.relu = nn.ReLU(inplace=True) 104 | self.downsample = downsample 105 | self.stride = stride 106 | self.last_relu = last_relu 107 | 108 | self.tam = TAM(num_frames, inplanes, blending_frames) \ 109 | if with_tam else None 110 | 111 | def forward(self, x): 112 | residual = x 113 | 114 | if self.tam is not None: 115 | x = self.tam(x) 116 | 117 | out = self.conv1(x) 118 | out = self.bn1(out) 119 | out = self.relu(out) 120 | 121 | out = self.conv2(out) 122 | out = self.bn2(out) 123 | out = self.relu(out) 124 | 125 | out = self.conv3(out) 126 | out = self.bn3(out) 127 | 128 | if self.downsample is not None: 129 | residual = self.downsample(x) 130 | 131 | out += residual 132 | if self.last_relu: 133 | out = self.relu(out) 134 | 135 | return out 136 | 137 | 138 | class bLModule(nn.Module): 139 | def __init__(self, block, in_channels, out_channels, 
blocks, alpha, beta, stride, 140 | num_frames, blending_frames=3): 141 | super(bLModule, self).__init__() 142 | self.num_frames = num_frames 143 | self.blending_frames = blending_frames 144 | 145 | self.relu = nn.ReLU(inplace=True) 146 | self.big = self._make_layer(block, in_channels, out_channels, blocks - 1, 2, last_relu=False) 147 | self.little = self._make_layer(block, in_channels, out_channels // alpha, max(1, blocks // beta - 1)) 148 | self.little_e = nn.Sequential( 149 | nn.Conv2d(out_channels // alpha, out_channels, kernel_size=1, bias=False), 150 | nn.BatchNorm2d(out_channels)) 151 | 152 | self.fusion = self._make_layer(block, out_channels, out_channels, 1, stride=stride) 153 | self.tam = TAM(self.num_frames, in_channels, blending_frames=self.blending_frames) 154 | 155 | def _make_layer(self, block, inplanes, planes, blocks, stride=1, last_relu=True): 156 | downsample = [] 157 | if stride != 1: 158 | downsample.append(nn.AvgPool2d(3, stride=2, padding=1)) 159 | if inplanes != planes: 160 | downsample.append(nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False)) 161 | downsample.append(nn.BatchNorm2d(planes)) 162 | downsample = None if downsample == [] else nn.Sequential(*downsample) 163 | 164 | layers = [] 165 | if blocks == 1: 166 | layers.append(block(inplanes, planes, stride, downsample)) 167 | else: 168 | layers.append(block(inplanes, planes, stride, downsample)) 169 | for i in range(1, blocks): 170 | layers.append(block(planes, planes, 171 | last_relu=last_relu if i == blocks - 1 else True)) 172 | 173 | return nn.Sequential(*layers) 174 | 175 | def forward(self, x, big_frame_num, big_list, little_frame_num, little_list): 176 | n = x.size()[0] 177 | if self.tam is not None: 178 | x = self.tam(x) 179 | 180 | big = self.big(x[big_list, ::]) 181 | little = self.little(x[little_list, ::]) 182 | little = self.little_e(little) 183 | big = torch.nn.functional.interpolate(big, little.shape[2:]) 184 | 185 | # [0 1] sum up current and next frames 186 | bn = big_frame_num 187 | ln = little_frame_num 188 | 189 | big = big.view((-1, bn) + big.size()[1:]) 190 | little = little.view((-1, ln) + little.size()[1:]) 191 | big += little # left frame 192 | 193 | # only do the big branch 194 | big = big.view((-1,) + big.size()[2:]) 195 | big = self.relu(big) 196 | big = self.fusion(big) 197 | 198 | # distribute big to both 199 | x = torch.zeros((n,) + big.size()[1:], device=big.device, dtype=big.dtype) 200 | x[range(0, n, 2), ::] = big 201 | x[range(1, n, 2), ::] = big 202 | 203 | return x 204 | 205 | 206 | class bLVNet_TAM_BACKBONE(nn.Module): 207 | 208 | def __init__(self, block, layers, alpha, beta, num_frames, num_classes=1000, 209 | blending_frames=3, input_channels=3): 210 | 211 | self.num_frames = num_frames 212 | self.blending_frames = blending_frames 213 | 214 | self.bL_ratio = 2 215 | self.big_list = range(self.bL_ratio // 2, num_frames, self.bL_ratio) 216 | self.little_list = list(set(range(0, num_frames)) - set(self.big_list)) 217 | 218 | num_channels = [64, 128, 256, 512] 219 | self.inplanes = 64 220 | 221 | super().__init__() 222 | 223 | self.conv1 = nn.Conv2d(input_channels, num_channels[0], kernel_size=7, stride=2, padding=3, 224 | bias=False) 225 | self.bn1 = nn.BatchNorm2d(num_channels[0]) 226 | self.relu = nn.ReLU(inplace=True) 227 | self.b_conv0 = nn.Conv2d(num_channels[0], num_channels[0], kernel_size=3, stride=2, 228 | padding=1, bias=False) 229 | self.bn_b0 = nn.BatchNorm2d(num_channels[0]) 230 | self.l_conv0 = nn.Conv2d(num_channels[0], num_channels[0] // alpha, 
231 | kernel_size=3, stride=1, padding=1, bias=False) 232 | self.bn_l0 = nn.BatchNorm2d(num_channels[0] // alpha) 233 | self.l_conv1 = nn.Conv2d(num_channels[0] // alpha, num_channels[0] // 234 | alpha, kernel_size=3, stride=2, padding=1, bias=False) 235 | self.bn_l1 = nn.BatchNorm2d(num_channels[0] // alpha) 236 | self.l_conv2 = nn.Conv2d(num_channels[0] // alpha, num_channels[0], kernel_size=1, stride=1, bias=False) 237 | self.bn_l2 = nn.BatchNorm2d(num_channels[0]) 238 | 239 | self.bl_init = nn.Conv2d(num_channels[0], num_channels[0], kernel_size=1, stride=1, bias=False) 240 | self.bn_bl_init = nn.BatchNorm2d(num_channels[0]) 241 | 242 | self.tam = TAM(self.num_frames, num_channels[0], blending_frames=self.blending_frames) 243 | 244 | self.layer1 = bLModule(block, num_channels[0], num_channels[0] * block.expansion, 245 | layers[0], alpha, beta, stride=2, num_frames=self.num_frames, 246 | blending_frames=blending_frames) 247 | self.layer2 = bLModule(block, num_channels[0] * block.expansion, 248 | num_channels[1] * block.expansion, layers[1], alpha, beta, stride=2, 249 | num_frames=self.num_frames, 250 | blending_frames=blending_frames) 251 | self.layer3 = bLModule(block, num_channels[1] * block.expansion, 252 | num_channels[2] * block.expansion, layers[2], alpha, beta, stride=1, 253 | num_frames=self.num_frames, 254 | blending_frames=blending_frames) 255 | # only half frames are used. 256 | self.layer4 = self._make_layer( 257 | block, num_channels[2] * block.expansion, num_channels[3] * block.expansion, layers[3], 258 | num_frames=self.num_frames // 2, stride=2) 259 | 260 | self.gappool = nn.AdaptiveAvgPool2d(1) 261 | self.fc = nn.Linear(num_channels[3] * block.expansion, num_classes) 262 | 263 | for m in self.modules(): 264 | if isinstance(m, nn.Conv2d): 265 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 266 | elif isinstance(m, nn.BatchNorm2d): 267 | nn.init.constant_(m.weight, 1) 268 | nn.init.constant_(m.bias, 0) 269 | 270 | # Zero-initialize the last BN in each block. 
271 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 272 | for m in self.modules(): 273 | if isinstance(m, Bottleneck): 274 | nn.init.constant_(m.bn3.weight, 0) 275 | 276 | def _make_layer(self, block, inplanes, planes, blocks, num_frames, stride=1, with_tam=True): 277 | downsample = [] 278 | if stride != 1: 279 | downsample.append(nn.AvgPool2d(3, stride=2, padding=1)) 280 | if inplanes != planes: 281 | downsample.append(nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False)) 282 | downsample.append(nn.BatchNorm2d(planes)) 283 | downsample = None if downsample == [] else nn.Sequential(*downsample) 284 | 285 | layers = [] 286 | layers.append(block(inplanes, planes, stride, downsample, with_tam=with_tam, 287 | num_frames=num_frames, blending_frames=self.blending_frames)) 288 | for i in range(1, blocks): 289 | layers.append(block(planes, planes, with_tam=with_tam, 290 | num_frames=num_frames, blending_frames=self.blending_frames)) 291 | 292 | return nn.Sequential(*layers) 293 | 294 | def _forward_bL_layer0(self, x, big_frame_num, big_list, little_frame_num, little_list): 295 | n = x.size()[0] 296 | if self.tam is not None: 297 | x = self.tam(x) 298 | 299 | bx = self.b_conv0(x[big_list, ::]) 300 | bx = self.bn_b0(bx) 301 | 302 | lx = self.l_conv0(x[little_list, ::]) 303 | lx = self.bn_l0(lx) 304 | lx = self.relu(lx) 305 | lx = self.l_conv1(lx) 306 | lx = self.bn_l1(lx) 307 | lx = self.relu(lx) 308 | lx = self.l_conv2(lx) 309 | lx = self.bn_l2(lx) 310 | 311 | bn = big_frame_num 312 | ln = little_frame_num 313 | bx = bx.view((-1, bn) + bx.size()[1:]) 314 | lx = lx.view((-1, ln) + lx.size()[1:]) 315 | bx += lx # left frame 316 | 317 | bx = bx.view((-1,) + bx.size()[2:]) 318 | 319 | bx = self.relu(bx) 320 | bx = self.bl_init(bx) 321 | bx = self.bn_bl_init(bx) 322 | bx = self.relu(bx) 323 | 324 | x = torch.zeros((n,) + bx.size()[1:], device=bx.device, dtype=bx.dtype) 325 | x[range(0, n, 2), ::] = bx 326 | x[range(1, n, 2), ::] = bx 327 | 328 | return x 329 | 330 | def forward(self, x): 331 | x = self.conv1(x) 332 | x = self.bn1(x) 333 | x = self.relu(x) 334 | 335 | n = x.size()[0] 336 | batch_size = n // self.num_frames 337 | big_list = get_frame_list(self.big_list, self.num_frames, batch_size) 338 | little_list = get_frame_list(self.little_list, self.num_frames, batch_size) 339 | 340 | x = self._forward_bL_layer0(x, len(self.big_list), big_list, len(self.little_list), little_list) 341 | x = self.layer1(x, len(self.big_list), big_list, len(self.little_list), little_list) 342 | x = self.layer2(x, len(self.big_list), big_list, len(self.little_list), little_list) 343 | x = self.layer3(x, len(self.big_list), big_list, len(self.little_list), little_list) 344 | 345 | x = self.layer4(x[big_list, ::]) 346 | 347 | x = self.gappool(x) 348 | x = x.view(x.size(0), -1) 349 | x = self.fc(x) 350 | 351 | return x 352 | 353 | 354 | def blvnet_tam_backbone(depth, alpha, beta, num_frames, blending_frames=3, input_channels=3, 355 | imagenet_blnet_pretrained=True): 356 | layers = { 357 | 50: [3, 4, 6, 3], 358 | 101: [4, 8, 18, 3], 359 | 152: [5, 12, 30, 3] 360 | }[depth] 361 | 362 | model = bLVNet_TAM_BACKBONE(Bottleneck, layers, alpha, beta, num_frames, 363 | blending_frames=blending_frames, input_channels=input_channels) 364 | 365 | if imagenet_blnet_pretrained: 366 | checkpoint = torch.load(model_urls['blresnet{}'.format(depth)], map_location='cpu') 367 | print("loading weights from ImageNet-pretrained blnet, blresnet{}".format(depth), 368 | flush=True) 369 | # fixed 
parameter names in order to load the weights correctly 370 | state_d = OrderedDict() 371 | if input_channels != 3: # flow 372 | print("Convert RGB model to Flow") 373 | for key, value in checkpoint['state_dict'].items(): 374 | new_key = key.replace('module.', '') 375 | if "conv1.weight" in key: 376 | o_c, in_c, k_h, k_w = value.shape 377 | else: 378 | o_c, in_c, k_h, k_w = 0, 0, 0, 0 379 | if k_h == 7 and k_w == 7: 380 | # average the weights and expand to all channels 381 | new_shape = (o_c, input_channels, k_h, k_w) 382 | new_value = value.mean(dim=1, keepdim=True).expand(new_shape).contiguous() 383 | else: 384 | new_value = value 385 | state_d[new_key] = new_value 386 | else: 387 | print("Loading RGB model") 388 | for key, value in checkpoint['state_dict'].items(): 389 | new_key = key.replace('module.', '') 390 | state_d[new_key] = value 391 | msg = model.load_state_dict(state_d, strict=False) 392 | print(msg, flush=True) 393 | 394 | return model 395 | -------------------------------------------------------------------------------- /core/video_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | from PIL import Image 6 | import torch.utils.data as data 7 | 8 | 9 | def random_clip(video_frames, sampling_rate, frames_per_clip, fixed_offset=False): 10 | """ 11 | 12 | Args: 13 | video_frames (int): total frame number of a video 14 | sampling_rate (int): sampling rate for clip, pick one every k frames 15 | frames_per_clip (int): number of frames of a clip 16 | fixed_offset (bool): used with sample offset to decide the offset value deterministically. 17 | 18 | Returns: 19 | list[int]: frame indices (started from zero) 20 | """ 21 | new_sampling_rate = sampling_rate 22 | highest_idx = video_frames - new_sampling_rate * frames_per_clip 23 | if highest_idx <= 0: 24 | random_offset = 0 25 | else: 26 | if fixed_offset: 27 | random_offset = (video_frames - new_sampling_rate * frames_per_clip) // 2 28 | else: 29 | random_offset = int(np.random.randint(0, highest_idx, 1)) 30 | frame_idx = [int(random_offset + i * sampling_rate) % video_frames for i in range(frames_per_clip)] 31 | return frame_idx 32 | 33 | 34 | class VideoRecord(object): 35 | def __init__(self, path, start_frame, end_frame, label, reverse=False): 36 | self.path = path 37 | self.video_id = os.path.basename(path) 38 | self.start_frame = start_frame 39 | self.end_frame = end_frame 40 | self.label = label 41 | self.reverse = reverse 42 | 43 | @property 44 | def num_frames(self): 45 | return self.end_frame - self.start_frame + 1 46 | 47 | class VideoDataSet(data.Dataset): 48 | 49 | def __init__(self, root_path, list_file, num_groups=64, frames_per_group=4, sample_offset=0, num_clips=1, 50 | modality='rgb', dense_sampling=False, fixed_offset=True, 51 | image_tmpl='{:05d}.jpg', transform=None, is_train=True, test_mode=False, seperator=' ', 52 | filter_video=0, num_classes=None): 53 | """ 54 | 55 | Argments have different meaning when dense_sampling is True: 56 | - num_groups ==> number of frames 57 | - frames_per_group ==> sample every K frame 58 | - sample_offset ==> number of clips used in validation or test mode 59 | 60 | Args: 61 | root_path (str): the file path to the root of video folder 62 | list_file (str): the file list, each line with folder_path, start_frame, end_frame, label_id 63 | num_groups (int): number of frames per data sample 64 | frames_per_group (int): number of frames within one group 65 | sample_offset (int): used in 
validation/test, the offset when sampling frames from a group 66 | modality (str): rgb or flow 67 | dense_sampling (bool): dense sampling in I3D 68 | fixed_offset (bool): used for generating the same videos used in TSM 69 | image_tmpl (str): template of image ids 70 | transform: the transformer for preprocessing 71 | is_train (bool): shuffle the video but keep the causality 72 | test_mode (bool): testing mode, no label 73 | """ 74 | if modality not in ['flow', 'rgb']: 75 | raise ValueError("modality should be 'flow' or 'rgb'.") 76 | 77 | self.root_path = root_path 78 | self.list_file = list_file 79 | self.num_groups = num_groups 80 | self.num_frames = num_groups 81 | self.frames_per_group = frames_per_group 82 | self.sample_freq = frames_per_group 83 | self.num_clips = num_clips 84 | self.sample_offset = sample_offset 85 | self.fixed_offset = fixed_offset 86 | self.dense_sampling = dense_sampling 87 | self.modality = modality.lower() 88 | self.image_tmpl = image_tmpl 89 | self.transform = transform 90 | self.is_train = is_train 91 | self.test_mode = test_mode 92 | self.seperator = seperator 93 | self.filter_video = filter_video 94 | 95 | if self.modality == 'flow': 96 | self.num_consecutive_frames = 5 97 | else: 98 | self.num_consecutive_frames = 1 99 | 100 | self.multi_label = None 101 | self.video_list = self._parse_list() 102 | self.num_classes = num_classes 103 | 104 | def _image_path(self, directory, idx): 105 | return os.path.join(self.root_path, directory, self.image_tmpl.format(idx)) 106 | 107 | def _load_image(self, directory, idx): 108 | 109 | def _safe_load_image(img_path): 110 | img_tmp = Image.open(img_path) 111 | img = img_tmp.copy() 112 | img_tmp.close() 113 | return img 114 | 115 | num_try = 0 116 | image_path_file = os.path.join(self.root_path, directory, self.image_tmpl.format(idx)) 117 | 118 | img = None 119 | while num_try < 10: 120 | try: 121 | if self.modality == 'rgb': 122 | img = [_safe_load_image(image_path_file)] 123 | else: 124 | ext = image_path_file.split(".")[-1] 125 | flow_x_name = image_path_file.replace(".{}".format(ext), "_x.{}".format(ext)) 126 | flow_y_name = image_path_file.replace(".{}".format(ext), "_y.{}".format(ext)) 127 | img = [_safe_load_image(flow_x_name), _safe_load_image(flow_y_name)] 128 | break 129 | except Exception as e: 130 | print('[Will try load again] error loading image: {}, error: {}'.format(image_path_file, str(e))) 131 | num_try += 1 132 | 133 | if img is None: 134 | raise ValueError('[Fail 10 times] error loading image: {}'.format(image_path_file)) 135 | 136 | return img 137 | 138 | def _parse_list(self): 139 | # usualy it is [video_id, num_frames, class_idx] 140 | # or [video_id, start_frame, end_frame, list of class_idx] 141 | tmp = [] 142 | original_video_numbers = 0 143 | for x in open(self.list_file): 144 | elements = x.strip().split(self.seperator) 145 | start_frame = int(elements[1]) 146 | end_frame = int(elements[2]) 147 | total_frame = end_frame - start_frame + 1 148 | original_video_numbers += 1 149 | if self.test_mode: 150 | tmp.append(elements) 151 | else: 152 | if total_frame >= self.filter_video: 153 | tmp.append(elements) 154 | 155 | num = len(tmp) 156 | print("The number of videos is {} (with more than {} frames) " 157 | "(original: {})".format(num, self.filter_video, original_video_numbers), flush=True) 158 | assert (num > 0) 159 | # TODO: a better way to check if multi-label or not 160 | self.multi_label = True if len(tmp[0]) > 4 else False 161 | file_list = [] 162 | for item in tmp: 163 | if self.test_mode: 164 
| file_list.append([item[0], int(item[1]), int(item[2]), -1]) 165 | else: 166 | labels = [] 167 | for i in range(3, len(item)): 168 | labels.append(float(item[i])) 169 | if not self.multi_label: 170 | labels = labels[0] if len(labels) == 1 else labels 171 | file_list.append([item[0], int(item[1]), int(item[2]), labels]) 172 | 173 | video_list = [VideoRecord(item[0], item[1], item[2], item[3]) for item in file_list] 174 | # flow model has one frame less 175 | if self.modality == 'flow': 176 | for i in range(len(video_list)): 177 | video_list[i].end_frame -= 1 178 | 179 | return video_list 180 | 181 | def _sample_indices(self, record): 182 | """ 183 | Used for training. 184 | 185 | Args: 186 | - record (VideoRecord): 187 | 188 | Returns: 189 | list: frame index, index starts from 1. 190 | """ 191 | max_frame_idx = max(1, record.num_frames - self.num_consecutive_frames + 1) 192 | if self.dense_sampling: 193 | frame_idx = np.asarray(random_clip(max_frame_idx, self.sample_freq, self.num_frames)) 194 | else: 195 | total_frames = self.num_groups * self.frames_per_group 196 | ave_frames_per_group = max_frame_idx // self.num_groups 197 | if ave_frames_per_group >= self.frames_per_group: 198 | # randomly sample f images per segement 199 | frame_idx = np.arange(0, self.num_groups) * ave_frames_per_group 200 | frame_idx = np.repeat(frame_idx, repeats=self.frames_per_group) 201 | offsets = np.random.choice(ave_frames_per_group, self.frames_per_group, replace=False) 202 | offsets = np.tile(offsets, self.num_groups) 203 | frame_idx = frame_idx + offsets 204 | elif max_frame_idx < total_frames: 205 | # need to sample the same images 206 | frame_idx = np.random.choice(max_frame_idx, total_frames) 207 | else: 208 | # sample cross all images 209 | frame_idx = np.random.choice(max_frame_idx, total_frames, replace=False) 210 | frame_idx = np.sort(frame_idx) 211 | frame_idx = frame_idx + 1 212 | return frame_idx 213 | 214 | def _get_val_indices(self, record): 215 | max_frame_idx = max(1, record.num_frames - self.num_consecutive_frames + 1) 216 | if self.dense_sampling: 217 | if self.fixed_offset: 218 | sample_pos = max(1, 1 + max_frame_idx - self.sample_freq * self.num_frames) 219 | t_stride = self.sample_freq 220 | start_list = np.linspace(0, sample_pos - 1, num=self.num_clips, dtype=int) 221 | frame_idx = [] 222 | for start_idx in start_list.tolist(): 223 | frame_idx += [(idx * t_stride + start_idx) % max_frame_idx for idx in range(self.num_frames)] 224 | else: 225 | frame_idx = [] 226 | for i in range(self.num_clips): 227 | frame_idx.extend(random_clip(max_frame_idx, self.sample_freq, self.num_frames)) 228 | frame_idx = np.asarray(frame_idx) + 1 229 | else: # uniform sampling 230 | if self.fixed_offset: 231 | frame_idices = [] 232 | sample_offsets = list(range(-self.num_clips // 2 + 1, self.num_clips // 2 + 1)) 233 | for sample_offset in sample_offsets: 234 | if max_frame_idx > self.num_groups: 235 | tick = max_frame_idx / float(self.num_groups) 236 | curr_sample_offset = sample_offset 237 | if curr_sample_offset >= tick / 2.0: 238 | curr_sample_offset = tick / 2.0 - 1e-4 239 | elif curr_sample_offset < -tick / 2.0: 240 | curr_sample_offset = -tick / 2.0 241 | frame_idx = np.array([int(tick / 2.0 + curr_sample_offset + tick * x) for x in range(self.num_groups)]) 242 | else: 243 | np.random.seed(sample_offset - (-self.num_clips // 2 + 1)) 244 | frame_idx = np.random.choice(max_frame_idx, self.num_groups) 245 | frame_idx = np.sort(frame_idx) 246 | frame_idices.extend(frame_idx.tolist()) 247 | else: 248 | 
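# no fixed offset: draw an independent random set of frame indices for each of the num_clips clips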
frame_idices = [] 249 | for i in range(self.num_clips): 250 | total_frames = self.num_groups * self.frames_per_group 251 | ave_frames_per_group = max_frame_idx // self.num_groups 252 | if ave_frames_per_group >= self.frames_per_group: 253 | # randomly sample f images per segment 254 | frame_idx = np.arange(0, self.num_groups) * ave_frames_per_group 255 | frame_idx = np.repeat(frame_idx, repeats=self.frames_per_group) 256 | offsets = np.random.choice(ave_frames_per_group, self.frames_per_group, replace=False) 257 | offsets = np.tile(offsets, self.num_groups) 258 | frame_idx = frame_idx + offsets 259 | elif max_frame_idx < total_frames: 260 | # need to sample the same images 261 | np.random.seed(i) 262 | frame_idx = np.random.choice(max_frame_idx, total_frames) 263 | else: 264 | # sample cross all images 265 | np.random.seed(i) 266 | frame_idx = np.random.choice(max_frame_idx, total_frames, replace=False) 267 | frame_idx = np.sort(frame_idx) 268 | frame_idices.extend(frame_idx.tolist()) 269 | frame_idx = np.asarray(frame_idices) + 1 270 | return frame_idx 271 | 272 | def __getitem__(self, index): 273 | """ 274 | Returns: 275 | torch.FloatTensor: (3xgxf)xHxW dimension, g is number of groups and f is the frames per group. 276 | torch.FloatTensor: the label 277 | """ 278 | record = self.video_list[index] 279 | # check this is a legit video folder 280 | if self.is_train: 281 | indices = self._sample_indices(record) 282 | else: 283 | indices = self._get_val_indices(record) 284 | 285 | images = [] 286 | for seg_ind in indices: 287 | for i in range(self.num_consecutive_frames): 288 | new_seg_ind = min(seg_ind + record.start_frame - 1 + i, record.num_frames) 289 | seg_imgs = self._load_image(record.path, new_seg_ind) 290 | images.extend(seg_imgs) 291 | 292 | images = self.transform(images) 293 | if self.test_mode: 294 | # in test mode, return the video id as label 295 | label = int(record.video_id) 296 | else: 297 | if not self.multi_label: 298 | label = int(record.label) 299 | else: 300 | # create a binary vector. 301 | label = torch.zeros(self.num_classes, dtype=torch.float) 302 | for x in record.label: 303 | label[int(x)] = 1.0 304 | 305 | # re-order data to targeted format. 
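# images: a ((3 x g x f) x H x W) float tensor after self.transform; label: a class index, a multi-hot vector, or the video id in test mode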
306 | return images, label 307 | 308 | def __len__(self): 309 | return len(self.video_list) 310 | -------------------------------------------------------------------------------- /core/video_transforms.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | import random 3 | from PIL import Image, ImageOps 4 | import numpy as np 5 | import numbers 6 | import math 7 | import torch 8 | 9 | 10 | class GroupRandomCrop(object): 11 | def __init__(self, size): 12 | if isinstance(size, numbers.Number): 13 | self.size = (int(size), int(size)) 14 | else: 15 | self.size = size 16 | 17 | def __call__(self, img_group): 18 | 19 | w, h = img_group[0].size 20 | th, tw = self.size 21 | 22 | out_images = list() 23 | 24 | x1 = random.randint(0, w - tw) 25 | y1 = random.randint(0, h - th) 26 | 27 | for img in img_group: 28 | assert(img.size[0] == w and img.size[1] == h) 29 | if w == tw and h == th: 30 | out_images.append(img) 31 | else: 32 | out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) 33 | 34 | return out_images 35 | 36 | 37 | class GroupCenterCrop(object): 38 | def __init__(self, size): 39 | self.worker = torchvision.transforms.CenterCrop(size) 40 | 41 | def __call__(self, img_group): 42 | return [self.worker(img) for img in img_group] 43 | 44 | 45 | class GroupRandomHorizontalFlip(object): 46 | """Randomly horizontally flips the given PIL.Image with a probability of 0.5 47 | """ 48 | def __init__(self, is_flow=False): 49 | self.is_flow = is_flow 50 | 51 | def __call__(self, img_group, is_flow=False): 52 | v = random.random() 53 | if v < 0.5: 54 | ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] 55 | if self.is_flow: 56 | for i in range(0, len(ret), 2): 57 | ret[i] = ImageOps.invert(ret[i]) # invert flow pixel values when flipping 58 | return ret 59 | else: 60 | return img_group 61 | 62 | 63 | class GroupNormalize(object): 64 | def __init__(self, mean, std): 65 | self.mean = mean 66 | self.std = std 67 | 68 | def __call__(self, tensor): 69 | 70 | rep_mean = self.mean * (tensor.size()[0] // len(self.mean)) 71 | rep_std = self.std * (tensor.size()[0] // len(self.std)) 72 | 73 | # TODO: make efficient 74 | for t, m, s in zip(tensor, rep_mean, rep_std): 75 | t.sub_(m).div_(s) 76 | 77 | return tensor 78 | 79 | 80 | class GroupScale(object): 81 | """ Rescales the input PIL.Image to the given 'size'. 82 | 'size' will be the size of the smaller edge. 83 | For example, if height > width, then image will be 84 | rescaled to (size * height / width, size) 85 | size: size of the smaller edge 86 | interpolation: Default: PIL.Image.BILINEAR 87 | """ 88 | 89 | def __init__(self, size, interpolation=Image.BILINEAR): 90 | self.worker = torchvision.transforms.Resize(size, interpolation) 91 | 92 | def __call__(self, img_group): 93 | return [self.worker(img) for img in img_group] 94 | 95 | class GroupRandomScale(object): 96 | """ Rescales the input PIL.Image to the given 'size'. 97 | 'size' will be the size of the smaller edge. 98 | For example, if height > width, then image will be 99 | rescaled to (size * height / width, size) 100 | size: size of the smaller edge 101 | interpolation: Default: PIL.Image.BILINEAR 102 | 103 | Randomly select the smaller edge from the range of 'size'. 
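For example, GroupRandomScale((256, 320)) rescales each image so that its shorter side is a random integer in [256, 320].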
104 | """ 105 | def __init__(self, size, interpolation=Image.BILINEAR): 106 | self.size = size 107 | self.interpolation = interpolation 108 | 109 | def __call__(self, img_group): 110 | selected_size = np.random.randint(low=self.size[0], high=self.size[1] + 1, dtype=int) 111 | scale = GroupScale(selected_size, interpolation=self.interpolation) 112 | return scale(img_group) 113 | 114 | class GroupOverSample(object): 115 | def __init__(self, crop_size, scale_size=None, num_crops=5, flip=False): 116 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size) 117 | 118 | if scale_size is not None: 119 | self.scale_worker = GroupScale(scale_size) 120 | else: 121 | self.scale_worker = None 122 | 123 | if num_crops not in [1, 3, 5, 10]: 124 | raise ValueError("num_crops should be in [1, 3, 5, 10] but ({})".format(num_crops)) 125 | self.num_crops = num_crops 126 | 127 | self.flip = flip 128 | 129 | def __call__(self, img_group): 130 | 131 | if self.scale_worker is not None: 132 | img_group = self.scale_worker(img_group) 133 | 134 | image_w, image_h = img_group[0].size 135 | crop_w, crop_h = self.crop_size 136 | 137 | if self.num_crops == 3: 138 | w_step = (image_w - crop_w) // 4 139 | h_step = (image_h - crop_h) // 4 140 | offsets = list() 141 | if image_w < image_h: 142 | offsets.append((2 * w_step, 0 * h_step)) # top 143 | offsets.append((2 * w_step, 4 * h_step)) # bottom 144 | offsets.append((2 * w_step, 2 * h_step)) # center 145 | else: 146 | offsets.append((0 * w_step, 2 * h_step)) # left 147 | offsets.append((4 * w_step, 2 * h_step)) # right 148 | offsets.append((2 * w_step, 2 * h_step)) # center 149 | 150 | else: 151 | offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h) 152 | 153 | oversample_group = list() 154 | for o_w, o_h in offsets: 155 | normal_group = list() 156 | flip_group = list() 157 | for i, img in enumerate(img_group): 158 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) 159 | normal_group.append(crop) 160 | if self.flip: 161 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) 162 | 163 | if img.mode == 'L' and i % 2 == 0: 164 | flip_group.append(ImageOps.invert(flip_crop)) 165 | else: 166 | flip_group.append(flip_crop) 167 | 168 | oversample_group.extend(normal_group) 169 | if self.flip: 170 | oversample_group.extend(flip_group) 171 | return oversample_group 172 | 173 | 174 | class GroupMultiScaleCrop(object): 175 | 176 | def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True): 177 | self.scales = scales if scales is not None else [1, 875, .75, .66] 178 | self.max_distort = max_distort 179 | self.fix_crop = fix_crop 180 | self.more_fix_crop = more_fix_crop 181 | self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size] 182 | self.interpolation = Image.BILINEAR 183 | 184 | def __call__(self, img_group): 185 | 186 | im_size = img_group[0].size 187 | 188 | crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size) 189 | crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group] 190 | ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation) 191 | for img in crop_img_group] 192 | return ret_img_group 193 | 194 | def _sample_crop_size(self, im_size): 195 | image_w, image_h = im_size[0], im_size[1] 196 | 197 | # find a crop size 198 | base_size = min(image_w, image_h) 199 | crop_sizes = [int(base_size * x) for x in self.scales] 200 | crop_h 
= [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes] 201 | crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes] 202 | 203 | pairs = [] 204 | for i, h in enumerate(crop_h): 205 | for j, w in enumerate(crop_w): 206 | if abs(i - j) <= self.max_distort: 207 | pairs.append((w, h)) 208 | 209 | crop_pair = random.choice(pairs) 210 | if not self.fix_crop: 211 | w_offset = random.randint(0, image_w - crop_pair[0]) 212 | h_offset = random.randint(0, image_h - crop_pair[1]) 213 | else: 214 | w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1]) 215 | 216 | return crop_pair[0], crop_pair[1], w_offset, h_offset 217 | 218 | def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h): 219 | offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h) 220 | return random.choice(offsets) 221 | 222 | @staticmethod 223 | def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h): 224 | w_step = (image_w - crop_w) // 4 225 | h_step = (image_h - crop_h) // 4 226 | 227 | ret = list() 228 | ret.append((0, 0)) # upper left 229 | ret.append((4 * w_step, 0)) # upper right 230 | ret.append((0, 4 * h_step)) # lower left 231 | ret.append((4 * w_step, 4 * h_step)) # lower right 232 | ret.append((2 * w_step, 2 * h_step)) # center 233 | 234 | if more_fix_crop: 235 | ret.append((0, 2 * h_step)) # center left 236 | ret.append((4 * w_step, 2 * h_step)) # center right 237 | ret.append((2 * w_step, 4 * h_step)) # lower center 238 | ret.append((2 * w_step, 0 * h_step)) # upper center 239 | 240 | ret.append((1 * w_step, 1 * h_step)) # upper left quarter 241 | ret.append((3 * w_step, 1 * h_step)) # upper right quarter 242 | ret.append((1 * w_step, 3 * h_step)) # lower left quarter 243 | ret.append((3 * w_step, 3 * h_step)) # lower righ quarter 244 | 245 | return ret 246 | 247 | 248 | class GroupRandomSizedCrop(object): 249 | """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size 250 | and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio 251 | This is popularly used to train the Inception networks 252 | size: size of the smaller edge 253 | interpolation: Default: PIL.Image.BILINEAR 254 | """ 255 | def __init__(self, size, interpolation=Image.BILINEAR): 256 | self.size = size 257 | self.interpolation = interpolation 258 | 259 | def __call__(self, img_group): 260 | for attempt in range(10): 261 | area = img_group[0].size[0] * img_group[0].size[1] 262 | target_area = random.uniform(0.08, 1.0) * area 263 | aspect_ratio = random.uniform(3. / 4, 4. 
/ 3) 264 | 265 | w = int(round(math.sqrt(target_area * aspect_ratio))) 266 | h = int(round(math.sqrt(target_area / aspect_ratio))) 267 | 268 | if random.random() < 0.5: 269 | w, h = h, w 270 | 271 | if w <= img_group[0].size[0] and h <= img_group[0].size[1]: 272 | x1 = random.randint(0, img_group[0].size[0] - w) 273 | y1 = random.randint(0, img_group[0].size[1] - h) 274 | found = True 275 | break 276 | else: 277 | found = False 278 | x1 = 0 279 | y1 = 0 280 | 281 | if found: 282 | out_group = list() 283 | for img in img_group: 284 | img = img.crop((x1, y1, x1 + w, y1 + h)) 285 | assert(img.size == (w, h)) 286 | out_group.append(img.resize((self.size, self.size), self.interpolation)) 287 | return out_group 288 | else: 289 | # Fallback 290 | scale = GroupScale(self.size, interpolation=self.interpolation) 291 | crop = GroupRandomCrop(self.size) 292 | return crop(scale(img_group)) 293 | 294 | 295 | class Stack(object): 296 | 297 | def __init__(self, roll=False): 298 | self.roll = roll 299 | 300 | def __call__(self, img_group): 301 | if img_group[0].mode == 'L': 302 | return np.concatenate([np.expand_dims(x, 2) for x in img_group], axis=2) 303 | elif img_group[0].mode == 'RGB': 304 | if self.roll: 305 | return np.concatenate([np.array(x)[:, :, ::-1] for x in img_group], axis=2) 306 | else: 307 | return np.concatenate(img_group, axis=2) 308 | 309 | 310 | class ToTorchFormatTensor(object): 311 | """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] 312 | to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ 313 | def __init__(self, div=True, num_clips_crops=1): 314 | self.div = div 315 | self.num_clips_crops = num_clips_crops 316 | 317 | def __call__(self, pic): 318 | if isinstance(pic, np.ndarray): 319 | # handle numpy array 320 | if len(pic.shape) == 4: # the input format is ((NF)xCxHxW), output should be ((NC)xFxHxW) 321 | # ((NF)xCxHxW) --> (Cx(NF)xHxW) 322 | img = torch.from_numpy(pic).permute(3, 0, 1, 2).contiguous() 323 | c, nf, h, w = img.shape 324 | f = nf // self.num_clips_crops 325 | img = img.view((-1, f, h, w)) 326 | else: # data is HW(FC) 327 | img = torch.from_numpy(pic).permute(2, 0, 1).contiguous() 328 | else: 329 | # handle PIL Image 330 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 331 | img = img.view(pic.size[1], pic.size[0], len(pic.mode)) 332 | # put it from HWC to CHW format 333 | # yikes, this transpose takes 80% of the loading time/CPU 334 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 335 | return img.float().div(255) if self.div else img.float() 336 | 337 | 338 | class IdentityTransform(object): 339 | 340 | def __call__(self, data): 341 | return data 342 | -------------------------------------------------------------------------------- /core/video_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import multiprocessing 3 | 4 | import torch 5 | import torch.nn.parallel 6 | import torch.optim 7 | import torch.utils.data 8 | import torch.utils.data.distributed 9 | import torchvision.transforms as transforms 10 | from tqdm import tqdm 11 | 12 | from .video_transforms import (GroupRandomHorizontalFlip, 13 | GroupMultiScaleCrop, GroupScale, GroupCenterCrop, 14 | GroupNormalize, Stack, ToTorchFormatTensor) 15 | from .models import bLVNet_TAM 16 | 17 | 18 | def build_model(args, test_mode=False): 19 | backbone_setting = {'depth': args.depth, 'alpha': args.alpha, 'beta': args.beta, 20 | 'groups': args.groups, 'num_classes': 
args.num_classes, 21 | 'dropout': args.dropout, 'blending_frames': args.blending_frames, 22 | 'input_channels': args.input_channels, 'pretrained': args.pretrained, 23 | 'dataset': args.dataset, 24 | 'imagenet_blnet_pretrained': args.imagenet_blnet_pretrained} 25 | # create model 26 | model = bLVNet_TAM(backbone_setting) 27 | 28 | arch_name = model.model_name 29 | if not test_mode: 30 | arch_name += "-{lr_scheduler}-bs{batch_size}-e{epochs}" 31 | arch_name = arch_name.format(lr_scheduler=args.lr_scheduler, batch_size=args.batch_size, 32 | epochs=args.epochs) 33 | 34 | return model, arch_name 35 | 36 | 37 | class AverageMeter(object): 38 | """Computes and stores the average and current value""" 39 | def __init__(self): 40 | self.val = 0 41 | self.avg = 0 42 | self.sum = 0 43 | self.count = 0 44 | 45 | def reset(self): 46 | self.val = 0 47 | self.avg = 0 48 | self.sum = 0 49 | self.count = 0 50 | 51 | def update(self, val, n=1): 52 | self.val = val 53 | self.sum += val * n 54 | self.count += n 55 | self.avg = self.sum / self.count 56 | 57 | 58 | def accuracy(output, target, topk=(1, 5)): 59 | """Computes the precision@k for the specified values of k""" 60 | with torch.no_grad(): 61 | maxk = max(topk) 62 | batch_size = target.size(0) 63 | 64 | _, pred = output.topk(maxk, 1, True, True) 65 | pred = pred.t() 66 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 67 | 68 | res = [] 69 | for k in topk: 70 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 71 | res.append(correct_k.mul_(100.0 / batch_size)) 72 | return res 73 | 74 | 75 | def get_augmentor(is_train, image_size, mean=None, 76 | std=None, disable_scaleup=False, is_flow=False): 77 | augments = [] 78 | 79 | mean = [0.485, 0.456, 0.406] if mean is None else mean 80 | std = [0.485, 0.456, 0.406] if std is None else std 81 | 82 | if is_train: 83 | augments += [ 84 | GroupMultiScaleCrop(image_size, [1, .875, .75, .66]), 85 | GroupRandomHorizontalFlip(is_flow=is_flow) 86 | ] 87 | else: 88 | scaled_size = image_size if disable_scaleup else int(image_size / 0.875 + 0.5) 89 | augments += [ 90 | GroupScale(scaled_size), 91 | GroupCenterCrop(image_size) 92 | ] 93 | augments += [ 94 | Stack(), 95 | ToTorchFormatTensor(), 96 | GroupNormalize(mean=mean, std=std) 97 | ] 98 | 99 | augmentor = transforms.Compose(augments) 100 | return augmentor 101 | 102 | 103 | def build_dataflow(dataset, is_train, batch_size, workers=36, is_distributed=False): 104 | workers = min(workers, multiprocessing.cpu_count()) 105 | shuffle = False 106 | sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None 107 | 108 | if is_train: 109 | shuffle = sampler is None 110 | 111 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, 112 | num_workers=workers, pin_memory=True, sampler=sampler) 113 | 114 | return data_loader 115 | 116 | 117 | def train(data_loader, model, criterion, optimizer, epoch, display=100, 118 | steps_per_epoch=99999999999, gpu_id=None): 119 | batch_time = AverageMeter() 120 | data_time = AverageMeter() 121 | losses = AverageMeter() 122 | top1 = AverageMeter() 123 | top5 = AverageMeter() 124 | 125 | # switch to train mode 126 | model.train() 127 | end = time.time() 128 | num_batch = 0 129 | with tqdm(total=len(data_loader)) as t_bar: 130 | for i, (images, target) in enumerate(data_loader): 131 | # measure data loading time 132 | data_time.update(time.time() - end) 133 | # compute output 134 | if gpu_id is not None: 135 | images = images.cuda(gpu_id, 
non_blocking=True) 136 | 137 | output = model(images) 138 | #TODO check label_smoothing 139 | target = target.cuda(gpu_id, non_blocking=True) 140 | loss = criterion(output, target) 141 | 142 | # measure accuracy and record loss 143 | prec1, prec5 = accuracy(output, target) 144 | 145 | losses.update(loss.item(), images.size(0)) 146 | top1.update(prec1[0], images.size(0)) 147 | top5.update(prec5[0], images.size(0)) 148 | # compute gradient and do SGD step 149 | loss.backward() 150 | 151 | optimizer.step() 152 | optimizer.zero_grad() 153 | 154 | # measure elapsed time 155 | batch_time.update(time.time() - end) 156 | end = time.time() 157 | if i % display == 0: 158 | print('Epoch: [{0}][{1}/{2}]\t' 159 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 160 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 161 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 162 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 163 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 164 | epoch, i, len(data_loader), batch_time=batch_time, 165 | data_time=data_time, loss=losses, top1=top1, top5=top5), flush=True) 166 | num_batch += 1 167 | t_bar.update(1) 168 | if i > steps_per_epoch: 169 | break 170 | 171 | return top1.avg, top5.avg, losses.avg, batch_time.avg, data_time.avg, num_batch 172 | 173 | 174 | def validate(data_loader, model, criterion, gpu_id=None): 175 | batch_time = AverageMeter() 176 | losses = AverageMeter() 177 | top1 = AverageMeter() 178 | top5 = AverageMeter() 179 | 180 | # switch to evaluate mode 181 | model.eval() 182 | 183 | with torch.no_grad(), tqdm(total=len(data_loader)) as t_bar: 184 | end = time.time() 185 | for i, (images, target) in enumerate(data_loader): 186 | 187 | if gpu_id is not None: 188 | images = images.cuda(gpu_id, non_blocking=True) 189 | target = target.cuda(gpu_id, non_blocking=True) 190 | 191 | # compute output 192 | output = model(images) 193 | loss = criterion(output, target) 194 | 195 | # measure accuracy and record loss 196 | prec1, prec5 = accuracy(output, target) 197 | losses.update(loss.item(), images.size(0)) 198 | top1.update(prec1[0], images.size(0)) 199 | top5.update(prec5[0], images.size(0)) 200 | 201 | # measure elapsed time 202 | batch_time.update(time.time() - end) 203 | end = time.time() 204 | t_bar.update(1) 205 | 206 | return top1.avg, top5.avg, losses.avg, batch_time.avg 207 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def arg_parser(): 5 | parser = argparse.ArgumentParser(description='PyTorch Action Recognition Training') 6 | 7 | # model definition 8 | parser.add_argument('-d', '--depth', default=50, type=int, metavar='N', 9 | help='depth of blresnet (default: 50)', choices=[50, 101]) 10 | parser.add_argument('--dropout', default=0.5, type=float) 11 | parser.add_argument('--groups', default=16, type=int) 12 | parser.add_argument('--frames_per_group', default=1, type=int) 13 | parser.add_argument('--alpha', default=2, type=int, metavar='N', help='ratio of channels') 14 | parser.add_argument('--beta', default=4, type=int, metavar='N', help='ratio of layers') 15 | parser.add_argument('--blending_frames', default=3, type=int) 16 | # training setting 17 | parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') 18 | parser.add_argument('-b', '--batch-size', default=256, type=int, 19 | metavar='N', help='mini-batch size (default: 256)') 20 | parser.add_argument('--lr', 
'--learning-rate', default=0.01, type=float, 21 | metavar='LR', help='initial learning rate') 22 | parser.add_argument('--lr_scheduler', default='cosine', type=str, 23 | help='learning rate scheduler', choices=['step', 'multisteps', 'cosine', 'plateau']) 24 | parser.add_argument('--lr_steps', default=[15, 30, 45], type=float, nargs="+", 25 | metavar='LRSteps', help='epochs at which to decay the learning rate by a factor of 10') 26 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') 27 | parser.add_argument('--weight-decay', '--wd', default=5e-4, type=float, 28 | metavar='W', help='weight decay (default: 5e-4)') 29 | parser.add_argument('--epochs', default=50, type=int, metavar='N', 30 | help='number of total epochs to run') 31 | parser.add_argument('--resume', default=None, type=str, metavar='PATH', 32 | help='path to latest checkpoint (default: none)') 33 | parser.add_argument('--pretrained', action='store_true', 34 | help='use pre-trained model') 35 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 36 | help='manual epoch number (useful on restarts)') 37 | parser.add_argument('--imagenet_blnet_pretrained', action='store_true', 38 | help='use imagenet-pretrained blnet model') 39 | 40 | # data-related 41 | parser.add_argument('-j', '--workers', default=18, type=int, metavar='N', 42 | help='number of data loading workers (default: 18)') 43 | parser.add_argument('--datadir', metavar='DIR', help='path to dataset file list') 44 | parser.add_argument('--dataset', default='st2stv2', 45 | choices=['st2stv2', 'st2stv1', 'kinetics400', 'moments_30fps'], 46 | help='name of the dataset') 47 | parser.add_argument('--input_shape', default=224, type=int, metavar='N', help='input image size') 48 | parser.add_argument('--disable_scaleup', action='store_true', 49 | help='do not scale up and then crop a small region, directly crop the input_shape size') 50 | parser.add_argument('--random_sampling', action='store_true', help='use random (non-fixed) frame offsets in the data loader') 51 | parser.add_argument('--dense_sampling', action='store_true', help='perform dense sampling for data loader') 52 | parser.add_argument('--modality', default='rgb', type=str, help='rgb or flow', choices=['rgb', 'flow']) 53 | # logging 54 | parser.add_argument('--logdir', default='', type=str, help='log path') 55 | parser.add_argument('--print-freq', default=100, type=int, 56 | help='frequency to print the log during the training') 57 | parser.add_argument('--show_model', action='store_true', help='show model summary') 58 | 59 | # for testing 60 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 61 | help='evaluate model on validation set') 62 | parser.add_argument('--num_crops', default=1, type=int, choices=[1, 3, 5, 10]) 63 | parser.add_argument('--num_clips', default=1, type=int) 64 | return parser 65 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | pytorch>=1.0.0,<1.3.0 2 | tensorboard_logger 3 | tqdm -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from torch.nn import functional as F 5 | import torch.nn.parallel 6 | import torch.backends.cudnn as cudnn 7 | import torch.optim 8 | import torch.utils.data 9 | import torch.utils.data.distributed 10 | import
torchvision.transforms as transforms 11 | from tqdm import tqdm 12 | 13 | from core.video_utils import build_dataflow, build_model 14 | from core.video_transforms import * 15 | from core.video_dataset import VideoDataSet 16 | from opts import arg_parser 17 | 18 | 19 | class AverageMeter(object): 20 | """Computes and stores the average and current value""" 21 | 22 | def __init__(self): 23 | self.val = 0 24 | self.avg = 0 25 | self.sum = 0 26 | self.count = 0 27 | 28 | def reset(self): 29 | self.val = 0 30 | self.avg = 0 31 | self.sum = 0 32 | self.count = 0 33 | 34 | def update(self, val, n=1): 35 | self.val = val 36 | self.sum += val * n 37 | self.count += n 38 | self.avg = self.sum / self.count 39 | 40 | 41 | def accuracy(output, target, topk=(1, 5)): 42 | """Computes the precision@k for the specified values of k""" 43 | with torch.no_grad(): 44 | maxk = max(topk) 45 | batch_size = target.size(0) 46 | 47 | _, pred = output.topk(maxk, 1, True, True) 48 | pred = pred.t() 49 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 50 | 51 | res = [] 52 | for k in topk: 53 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 54 | res.append(correct_k.mul_(100.0 / batch_size)) 55 | return res 56 | 57 | 58 | def load_categories(file_path): 59 | id_to_label = {} 60 | label_to_id = {} 61 | with open(file_path) as f: 62 | cls_id = 0 63 | for label in f.readlines(): 64 | label = label.strip() 65 | if label == "": 66 | continue 67 | id_to_label[cls_id] = label 68 | label_to_id[label] = cls_id 69 | cls_id += 1 70 | return id_to_label, label_to_id 71 | 72 | 73 | def eval_a_batch(data, model, num_clips=1, num_crops=1, softmax=False): 74 | with torch.no_grad(): 75 | batch_size = data.shape[0] 76 | data = data.view((batch_size * num_crops * num_clips, -1) + data.size()[2:]) 77 | result = model(data) 78 | result = result.reshape(batch_size, num_crops * num_clips, -1).mean(dim=1) 79 | if softmax: 80 | # take the softmax to normalize the output to probability 81 | result = F.softmax(result, dim=1) 82 | 83 | return result 84 | 85 | 86 | def main(): 87 | global args 88 | parser = arg_parser() 89 | args = parser.parse_args() 90 | cudnn.benchmark = True 91 | id_to_label = {} 92 | 93 | if args.dataset == 'st2stv2': 94 | num_classes = 174 95 | data_list_name = 'validation_256.txt' if args.evaluate else 'testing_256.txt' 96 | filename_seperator = " " 97 | image_tmpl = '{:05d}.jpg' 98 | filter_video = 3 99 | elif args.dataset == 'st2stv1': 100 | num_classes = 174 101 | data_list_name = 'validation_256.txt' if args.evaluate else 'testing_256.txt' 102 | filename_seperator = " " 103 | image_tmpl = '{:05d}.jpg' 104 | label_file = 'something-something-v1-labels.csv' 105 | filter_video = 3 106 | id_to_label, label_to_id = load_categories(os.path.join(args.datadir, label_file)) 107 | else: # 'kinetics400' 108 | num_classes = 400 109 | data_list_name = 'val_400_331.txt' if args.evaluate else 'test_400_331.txt' 110 | filename_seperator = ";" 111 | image_tmpl = '{:05d}.jpg' 112 | filter_video = 30 113 | 114 | args.num_classes = num_classes 115 | 116 | if args.gpu: 117 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 118 | 119 | if args.modality == 'rgb': 120 | mean = [0.485, 0.456, 0.406] 121 | std = [0.229, 0.224, 0.225] 122 | else: # flow 123 | mean = [0.5] 124 | std = [np.mean([0.229, 0.224, 0.225])] 125 | 126 | if args.modality == 'rgb': 127 | args.input_channels = 3 128 | elif args.modality == 'flow': 129 | args.input_channels = 2 * 5 130 | 131 | model, arch_name = build_model(args, test_mode=True) 132 | if 
args.pretrained is not None: 133 | print("=> using pre-trained model '{}'".format(arch_name)) 134 | else: 135 | print("=> creating model '{}'".format(arch_name)) 136 | 137 | model = model.cuda() 138 | model = torch.nn.DataParallel(model).cuda() 139 | 140 | # augmentor 141 | if args.disable_scaleup: 142 | scale_size = args.input_shape 143 | else: 144 | scale_size = int(args.input_shape / 0.875 + 0.5) 145 | 146 | augments = [] 147 | if args.num_crops == 1: 148 | augments += [ 149 | GroupScale(scale_size), 150 | GroupCenterCrop(args.input_shape) 151 | ] 152 | else: 153 | flip = True if args.num_crops == 10 else False 154 | augments += [ 155 | GroupOverSample(args.input_shape, scale_size, num_crops=args.num_crops, flip=flip), 156 | ] 157 | augments += [ 158 | Stack(), 159 | ToTorchFormatTensor(num_clips_crops=args.num_clips * args.num_crops), 160 | GroupNormalize(mean=mean, std=std) 161 | ] 162 | 163 | augmentor = transforms.Compose(augments) 164 | 165 | # Data loading code 166 | data_list = os.path.join(args.datadir, data_list_name) 167 | sample_offsets = list(range(-args.num_clips // 2 + 1, args.num_clips // 2 + 1)) 168 | print("Image is scaled to {} and crop {}".format(scale_size, args.input_shape)) 169 | print("Number of crops: {}".format(args.num_crops)) 170 | print("Number of clips: {}, offset from center with {}".format(args.num_clips, sample_offsets)) 171 | 172 | val_dataset = VideoDataSet("", data_list, args.groups, args.frames_per_group, 173 | num_clips=args.num_clips, modality=args.modality, 174 | image_tmpl=image_tmpl, 175 | dense_sampling=args.dense_sampling, 176 | fixed_offset=not args.random_sampling, 177 | transform=augmentor, is_train=False, test_mode=not args.evaluate, 178 | seperator=filename_seperator, filter_video=filter_video) 179 | 180 | data_loader = build_dataflow(val_dataset, is_train=False, batch_size=args.batch_size, 181 | workers=args.workers) 182 | 183 | log_folder = os.path.join(args.logdir, arch_name) 184 | if not os.path.exists(log_folder): 185 | os.makedirs(log_folder) 186 | 187 | batch_time = AverageMeter() 188 | if args.evaluate: 189 | logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a') 190 | top1 = AverageMeter() 191 | top5 = AverageMeter() 192 | else: 193 | logfile = open(os.path.join(log_folder, 194 | 'test_{}crops_{}clips_{}.csv'.format(args.num_crops, 195 | args.num_clips, 196 | args.input_shape)), 'w') 197 | 198 | total_outputs = 0 199 | outputs = np.zeros((len(data_loader) * args.batch_size, num_classes)) 200 | # switch to evaluate mode 201 | model.eval() 202 | total_batches = len(data_loader) 203 | with torch.no_grad(), tqdm(total=total_batches) as t_bar: 204 | end = time.time() 205 | for i, (video, label) in enumerate(data_loader): 206 | output = eval_a_batch(video, model, num_clips=args.num_clips, num_crops=args.num_crops, 207 | softmax=True) 208 | if args.evaluate: 209 | label = label.cuda(non_blocking=True) 210 | # measure accuracy 211 | prec1, prec5 = accuracy(output, label, topk=(1, 5)) 212 | top1.update(prec1[0], video.size(0)) 213 | top5.update(prec5[0], video.size(0)) 214 | output = output.data.cpu().numpy().copy() 215 | batch_size = output.shape[0] 216 | outputs[total_outputs:total_outputs + batch_size, :] = output 217 | else: 218 | # testing, store output to prepare csv file 219 | # measure elapsed time 220 | output = output.data.cpu().numpy().copy() 221 | batch_size = output.shape[0] 222 | outputs[total_outputs:total_outputs + batch_size, :] = output 223 | predictions = np.argsort(output, axis=1) 224 | for ii in 
range(len(predictions)): 225 | temp = predictions[ii][::-1][:5] 226 | preds = [str(pred) for pred in temp] 227 | if args.dataset == 'st2stv1': 228 | print("{};{}".format(label[ii], id_to_label[int(preds[0])]), file=logfile) 229 | else: 230 | print("{};{}".format(label[ii], ";".join(preds)), file=logfile) 231 | total_outputs += video.shape[0] 232 | batch_time.update(time.time() - end) 233 | end = time.time() 234 | t_bar.update(1) 235 | 236 | # if not args.evaluate: 237 | outputs = outputs[:total_outputs] 238 | print("Predict {} videos.".format(total_outputs), flush=True) 239 | np.save(os.path.join(log_folder, '{}_{}crops_{}clips_{}_details.npy'.format( 240 | "val" if args.evaluate else "test", args.num_crops, args.num_clips, args.input_shape)), 241 | outputs) 242 | 243 | if args.evaluate: 244 | print( 245 | 'Val@{}({}) (# crops = {}, # clips = {}): \tTop@1: {:.4f}\tTop@5: {:.4f}\t'.format( 246 | args.input_shape, scale_size, args.num_crops, args.num_clips, top1.avg, top5.avg 247 | ), flush=True) 248 | print( 249 | 'Val@{}({}) (# crops = {}, # clips = {}): \tTop@1: {:.4f}\tTop@5: {:.4f}\t'.format( 250 | args.input_shape, scale_size, args.num_crops, args.num_clips, top1.avg, top5.avg 251 | ), flush=True, file=logfile) 252 | 253 | logfile.close() 254 | 255 | 256 | if __name__ == '__main__': 257 | main() 258 | -------------------------------------------------------------------------------- /tools/extract_videos_st2st_v1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import cv2 5 | import concurrent.futures 6 | 7 | # input 8 | folder_root = "" 9 | 10 | label_file = "{}/something-something-v1-labels.csv".format(folder_root) 11 | train_file = "{}/something-something-v1-train.csv".format(folder_root) 12 | val_file = "{}/something-something-v1-validation.csv".format(folder_root) 13 | test_file = "{}/something-something-v1-test.csv".format(folder_root) 14 | video_folder = "{}/20bn-something-something-v1/".format(folder_root) 15 | 16 | # output 17 | train_img_folder = "{}/training_256/".format(folder_root) 18 | val_img_folder = "{}/validation_256/".format(folder_root) 19 | test_img_folder = "{}/testing_256/".format(folder_root) 20 | train_file_list = "{}/training_256.txt".format(folder_root) 21 | val_file_list = "{}/validation_256.txt".format(folder_root) 22 | test_file_list = "{}/testing_256.txt".format(folder_root) 23 | 24 | def load_categories(file_path): 25 | id_to_label = {} 26 | label_to_id = {} 27 | with open(file_path) as f: 28 | cls_id = 0 29 | for label in f.readlines(): 30 | label = label.strip() 31 | if label == "": 32 | continue 33 | id_to_label[cls_id] = label 34 | label_to_id[label] = cls_id 35 | cls_id += 1 36 | return id_to_label, label_to_id 37 | 38 | id_to_label, label_to_id = load_categories(label_file) 39 | 40 | def load_video_list(file_path): 41 | videos = [] 42 | with open(file_path) as f: 43 | for line in f.readlines(): 44 | line = line.strip() 45 | if line == "": 46 | continue 47 | video_id, label_name = line.split(";") 48 | label_name = label_name.strip() 49 | videos.append([video_id, label_name]) 50 | return videos 51 | 52 | 53 | def load_test_video_list(file_path): 54 | videos = [] 55 | with open(file_path) as f: 56 | for line in f.readlines(): 57 | line = line.strip() 58 | if line == "": 59 | continue 60 | videos.append([line]) 61 | return videos 62 | 63 | 64 | train_videos = load_video_list(train_file) 65 | val_videos = load_video_list(val_file) 66 | test_videos = 
load_test_video_list(test_file) 67 | 68 | 69 | def resize_to_short_side(h, w, short_side=256): 70 | newh, neww = h, w 71 | if h < w: 72 | newh = short_side 73 | neww = (w / h) * newh 74 | else: 75 | neww = short_side 76 | newh = (h / w) * neww 77 | neww = int(neww + 0.5) 78 | newh = int(newh + 0.5) 79 | return newh, neww 80 | 81 | def video_to_images(video, basedir, targetdir, short_side=256): 82 | try: 83 | cls_id = label_to_id[video[1]] 84 | except: 85 | cls_id = -1 86 | filename = os.path.join(basedir, video[0]) 87 | output_foldername = os.path.join(targetdir, video[0]) 88 | if not os.path.exists(filename): 89 | print("{} is not existed.".format(filename)) 90 | return video[0], cls_id, 0 91 | else: 92 | if not os.path.exists(output_foldername): 93 | os.makedirs(output_foldername) 94 | # get frame num 95 | i = 0 96 | while True: 97 | img_name = os.path.join(filename + "/{:05d}.jpg".format(i + 1)) 98 | if os.path.exists(img_name): 99 | output_filename = os.path.join(output_foldername + "/{:05d}.jpg".format(i + 1)) 100 | img = cv2.imread(img_name) 101 | width = img.shape[1] 102 | height = img.shape[0] 103 | newh, neww = resize_to_short_side(height, width, short_side) 104 | img = cv2.resize(img, (neww, newh), interpolation=cv2.INTER_LINEAR) 105 | cv2.imwrite(output_filename, img) 106 | i += 1 107 | else: 108 | break 109 | 110 | frame_num = i 111 | print("Finish {}, id: {} frames: {}".format(filename, cls_id, frame_num)) 112 | return video[0], cls_id, frame_num 113 | 114 | 115 | def create_train_video(short_side): 116 | with open(train_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 117 | futures = [executor.submit(video_to_images, video, video_folder, train_img_folder, int(short_side)) 118 | for video in train_videos] 119 | total_videos = len(futures) 120 | curr_idx = 0 121 | for future in concurrent.futures.as_completed(futures): 122 | video_id, label_id, frame_num = future.result() 123 | if frame_num == 0: 124 | print("Something wrong: {}".format(video_id)) 125 | else: 126 | print("{} 1 {} {}".format(os.path.join(train_img_folder, video_id), frame_num, label_id), file=f, flush=True) 127 | print("{}/{}".format(curr_idx, total_videos), flush=True) 128 | curr_idx += 1 129 | print("Completed") 130 | 131 | 132 | def create_val_video(short_side): 133 | with open(val_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 134 | futures = [executor.submit(video_to_images, video, video_folder, val_img_folder, int(short_side)) 135 | for video in val_videos] 136 | total_videos = len(futures) 137 | curr_idx = 0 138 | for future in concurrent.futures.as_completed(futures): 139 | video_id, label_id, frame_num = future.result() 140 | if frame_num == 0: 141 | print("Something wrong: {}".format(video_id)) 142 | else: 143 | print("{} 1 {} {}".format(os.path.join(val_img_folder, video_id), frame_num, label_id), file=f, flush=True) 144 | print("{}/{}".format(curr_idx, total_videos)) 145 | curr_idx += 1 146 | print("Completed") 147 | 148 | 149 | def create_test_video(short_side): 150 | with open(test_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 151 | futures = [executor.submit(video_to_images, video, video_folder, test_img_folder, int(short_side)) 152 | for video in test_videos] 153 | total_videos = len(futures) 154 | curr_idx = 0 155 | for future in concurrent.futures.as_completed(futures): 156 | video_id, label_id, frame_num = future.result() 157 | if frame_num == 0: 158 | 
print("Something wrong: {}".format(video_id)) 159 | else: 160 | print("{} 1 {}".format(os.path.join(test_img_folder, video_id), frame_num), file=f, flush=True) 161 | print("{}/{}".format(curr_idx, total_videos)) 162 | curr_idx += 1 163 | print("Completed") 164 | 165 | 166 | create_train_video(256) 167 | create_val_video(256) 168 | create_test_video(256) 169 | -------------------------------------------------------------------------------- /tools/extract_videos_st2st_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import json 5 | import skvideo.io 6 | import concurrent.futures 7 | import subprocess 8 | 9 | folder_root = "" 10 | 11 | # input 12 | label_file = "{}/something-something-v2-labels.json".format(folder_root) 13 | train_file = "{}/something-something-v2-train.json".format(folder_root) 14 | val_file = "{}/something-something-v2-validation.json".format(folder_root) 15 | test_file = "{}/something-something-v2-test.json".format(folder_root) 16 | video_folder = "{}/20bn-something-something-v2".format(folder_root) 17 | 18 | # output 19 | train_img_folder = "{}-new/training_256".format(folder_root) 20 | val_img_folder = "{}-new/validation_256".format(folder_root) 21 | test_img_folder = "{}-new/testing_256".format(folder_root) 22 | train_file_list = "{}-new/training_256.txt".format(folder_root) 23 | val_file_list = "{}-new/validation_256.txt".format(folder_root) 24 | test_file_list = "{}-new/testing_256.txt".format(folder_root) 25 | 26 | def load_categories(file_path): 27 | id_to_label = {} 28 | label_to_id = {} 29 | with open(file_path) as f: 30 | labels = json.load(f) 31 | for label, cls_id in labels.items(): 32 | label = label 33 | id_to_label[int(cls_id)] = label 34 | label_to_id[label] = int(cls_id) 35 | return id_to_label, label_to_id 36 | 37 | 38 | id_to_label, label_to_id = load_categories(label_file) 39 | 40 | 41 | def load_video_list(file_path): 42 | videos = [] 43 | with open(file_path) as f: 44 | file_list = json.load(f) 45 | for temp in file_list: 46 | videos.append([temp['id'], temp['template'].replace( 47 | "[", "").replace("]", ""), temp['label'], temp['placeholders']]) 48 | return videos 49 | 50 | 51 | def load_test_video_list(file_path): 52 | videos = [] 53 | with open(file_path) as f: 54 | file_list = json.load(f) 55 | for temp in file_list: 56 | videos.append([temp['id']]) 57 | return videos 58 | 59 | 60 | train_videos = load_video_list(train_file) 61 | val_videos = load_video_list(val_file) 62 | test_videos = load_test_video_list(test_file) 63 | 64 | 65 | def resize_to_short_side(h, w, short_side=360): 66 | newh, neww = h, w 67 | if h < w: 68 | newh = short_side 69 | neww = (w / h) * newh 70 | else: 71 | neww = short_side 72 | newh = (h / w) * neww 73 | neww = int(neww + 0.5) 74 | newh = int(newh + 0.5) 75 | return newh, neww 76 | 77 | 78 | def video_to_images(video, basedir, targetdir, short_side=256): 79 | try: 80 | cls_id = label_to_id[video[1]] 81 | except: 82 | cls_id = -1 83 | filename = os.path.join(basedir, video[0] + ".webm") 84 | output_foldername = os.path.join(targetdir, video[0]) 85 | if not os.path.exists(filename): 86 | print("{} is not existed.".format(filename)) 87 | return video[0], cls_id, 0 88 | else: 89 | try: 90 | video_meta = skvideo.io.ffprobe(filename) 91 | height = int(video_meta['video']['@height']) 92 | width = int(video_meta['video']['@width']) 93 | except: 94 | print("Can not get video info: {}".format(filename)) 95 | return video[0], cls_id, 0 96 | 97 
| if width > height: 98 | scale = "scale=-1:{}".format(short_side) 99 | else: 100 | scale = "scale={}:-1".format(short_side) 101 | if not os.path.exists(output_foldername): 102 | os.makedirs(output_foldername) 103 | 104 | command = ['ffmpeg', 105 | '-i', '"%s"' % filename, 106 | '-vf', scale, 107 | '-threads', '1', 108 | '-loglevel', 'panic', '-qmin', '1', '-qmax', '1', 109 | '-q:v', '0', 110 | '{}/'.format(output_foldername) + '"%05d.jpg"'] 111 | command = ' '.join(command) 112 | try: 113 | subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) 114 | except: 115 | print("fail to convert {}".format(filename)) 116 | return video[0], cls_id, 0 117 | 118 | # get frame num 119 | i = 0 120 | while True: 121 | img_name = os.path.join(output_foldername + "/{:05d}.jpg".format(i + 1)) 122 | if os.path.exists(img_name): 123 | i += 1 124 | else: 125 | break 126 | 127 | frame_num = i 128 | print("Finish {}, id: {} frames: {}".format(filename, cls_id, frame_num)) 129 | return video[0], cls_id, frame_num 130 | 131 | 132 | def create_train_video(short_side): 133 | with open(train_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 134 | futures = [executor.submit(video_to_images, video, video_folder, train_img_folder, int(short_side)) 135 | for video in train_videos] 136 | total_videos = len(futures) 137 | curr_idx = 0 138 | for future in concurrent.futures.as_completed(futures): 139 | video_id, label_id, frame_num = future.result() 140 | if frame_num == 0: 141 | print("Something wrong: {}".format(video_id)) 142 | else: 143 | print("{} 1 {} {}".format(os.path.join(train_img_folder, video_id), frame_num, label_id), file=f, flush=True) 144 | print("{}/{}".format(curr_idx, total_videos), flush=True) 145 | curr_idx += 1 146 | print("Completed") 147 | 148 | 149 | def create_val_video(short_side): 150 | with open(val_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 151 | futures = [executor.submit(video_to_images, video, video_folder, val_img_folder, int(short_side)) 152 | for video in val_videos] 153 | total_videos = len(futures) 154 | curr_idx = 0 155 | for future in concurrent.futures.as_completed(futures): 156 | video_id, label_id, frame_num = future.result() 157 | if frame_num == 0: 158 | print("Something wrong: {}".format(video_id)) 159 | else: 160 | print("{} 1 {} {}".format(os.path.join(val_img_folder, video_id), frame_num, label_id), file=f, flush=True) 161 | print("{}/{}".format(curr_idx, total_videos)) 162 | curr_idx += 1 163 | print("Completed") 164 | 165 | 166 | def create_test_video(short_side): 167 | with open(test_file_list, 'w') as f, concurrent.futures.ProcessPoolExecutor(max_workers=36) as executor: 168 | futures = [executor.submit(video_to_images, video, video_folder, test_img_folder, int(short_side)) 169 | for video in test_videos] 170 | total_videos = len(futures) 171 | curr_idx = 0 172 | for future in concurrent.futures.as_completed(futures): 173 | video_id, label_id, frame_num = future.result() 174 | if frame_num == 0: 175 | print("Something wrong: {}".format(video_id)) 176 | else: 177 | print("{} 1 {}".format(os.path.join(test_img_folder, video_id), frame_num), file=f, flush=True) 178 | print("{}/{}".format(curr_idx, total_videos)) 179 | curr_idx += 1 180 | print("Completed") 181 | 182 | 183 | create_train_video(256) 184 | create_val_video(256) 185 | create_test_video(256) 186 | -------------------------------------------------------------------------------- /train.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import time 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.parallel 9 | import torch.backends.cudnn as cudnn 10 | import torch.optim 11 | import torch.utils.data 12 | import torch.utils.data.distributed 13 | from torch.optim import lr_scheduler 14 | import tensorboard_logger 15 | 16 | from core.video_utils import (train, validate, build_dataflow, get_augmentor, build_model) 17 | from core.video_dataset import VideoDataSet 18 | from opts import arg_parser 19 | 20 | 21 | def save_checkpoint(state, is_best, filepath=''): 22 | torch.save(state, os.path.join(filepath, 'checkpoint.pth.tar')) 23 | if is_best: 24 | shutil.copyfile(os.path.join(filepath, 'checkpoint.pth.tar'), 25 | os.path.join(filepath, 'model_best.pth.tar')) 26 | 27 | 28 | def main(): 29 | global args 30 | parser = arg_parser() 31 | args = parser.parse_args() 32 | cudnn.benchmark = True 33 | 34 | if args.dataset == 'st2stv2': 35 | num_classes = 174 36 | train_list_name = 'training_256.txt' 37 | val_list_name = 'validation_256.txt' 38 | filename_seperator = " " 39 | image_tmpl = '{:05d}.jpg' 40 | filter_video = 3 41 | elif args.dataset == 'st2stv1': 42 | num_classes = 174 43 | train_list_name = 'training_256.txt' 44 | val_list_name = 'validation_256.txt' 45 | filename_seperator = " " 46 | image_tmpl = '{:05d}.jpg' 47 | filter_video = 3 48 | else: # kinetics400 49 | num_classes = 400 50 | train_list_name = 'train_400_331.txt' 51 | val_list_name = 'val_400_331.txt' 52 | filename_seperator = ";" 53 | image_tmpl = '{:05d}.jpg' 54 | filter_video = 30 55 | # elif args.dataset == 'moments_30fps': 56 | # num_classes = 339 57 | # train_list_name = 'training_256.txt' 58 | # val_list_name = 'validation_256.txt' 59 | # filename_seperator = " " 60 | # image_tmpl = '{:05d}.jpg' 61 | 62 | args.num_classes = num_classes 63 | 64 | if args.gpu: 65 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 66 | 67 | if args.modality == 'rgb': 68 | mean = [0.485, 0.456, 0.406] 69 | std = [0.229, 0.224, 0.225] 70 | elif args.modality == 'flow': 71 | mean = [0.5] 72 | std = [np.mean([0.229, 0.224, 0.225])] 73 | 74 | if args.modality == 'rgb': 75 | args.input_channels = 3 76 | elif args.modality == 'flow': 77 | args.input_channels = 2 * 5 78 | 79 | model, arch_name = build_model(args) 80 | 81 | if args.pretrained is not None: 82 | print("=> using pre-trained model '{}'".format(arch_name)) 83 | else: 84 | print("=> creating model '{}'".format(arch_name)) 85 | 86 | model = model.cuda() 87 | model = torch.nn.DataParallel(model).cuda() 88 | # define loss function (criterion) and optimizer 89 | train_criterion = nn.CrossEntropyLoss().cuda() 90 | val_criterion = nn.CrossEntropyLoss().cuda() 91 | 92 | # Data loading code 93 | val_list = os.path.join(args.datadir, val_list_name) 94 | 95 | val_augmentor = get_augmentor(False, args.input_shape, mean=mean, std=std, 96 | disable_scaleup=args.disable_scaleup, 97 | is_flow=True if args.modality == 'flow' else False) 98 | 99 | val_dataset = VideoDataSet("", val_list, args.groups, args.frames_per_group, 100 | num_clips=args.num_clips, 101 | modality=args.modality, image_tmpl=image_tmpl, 102 | dense_sampling=args.dense_sampling, 103 | transform=val_augmentor, is_train=False, test_mode=False, 104 | seperator=filename_seperator, filter_video=filter_video, 105 | num_classes=args.num_classes) 106 | 107 | val_loader = build_dataflow(val_dataset, is_train=False, 
batch_size=args.batch_size, 108 | workers=args.workers) 109 | 110 | log_folder = os.path.join(args.logdir, arch_name) 111 | if not os.path.exists(log_folder): 112 | os.makedirs(log_folder) 113 | 114 | if args.evaluate: 115 | logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a') 116 | val_top1, val_top5, val_losses, val_speed = validate(val_loader, model, val_criterion) 117 | print( 118 | 'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'.format( 119 | args.input_shape, val_losses, val_top1, val_top5, val_speed * 1000.0), flush=True) 120 | print( 121 | 'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'.format( 122 | args.input_shape, val_losses, val_top1, val_top5, val_speed * 1000.0), flush=True, file=logfile) 123 | return 124 | 125 | train_list = os.path.join(args.datadir, train_list_name) 126 | 127 | train_augmentor = get_augmentor(True, args.input_shape, mean=mean, std=std, 128 | disable_scaleup=args.disable_scaleup, 129 | is_flow=True if args.modality == 'flow' else False) 130 | 131 | train_dataset = VideoDataSet("", train_list, args.groups, args.frames_per_group, 132 | num_clips=args.num_clips, 133 | modality=args.modality, image_tmpl=image_tmpl, 134 | dense_sampling=args.dense_sampling, 135 | transform=train_augmentor, is_train=True, test_mode=False, 136 | seperator=filename_seperator, filter_video=filter_video, 137 | num_classes=args.num_classes) 138 | 139 | train_loader = build_dataflow(train_dataset, is_train=True, batch_size=args.batch_size, 140 | workers=args.workers) 141 | 142 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 143 | momentum=args.momentum, 144 | weight_decay=args.weight_decay, 145 | nesterov=True) 146 | 147 | if args.lr_scheduler == 'step': 148 | scheduler = lr_scheduler.StepLR(optimizer, args.lr_steps[0], gamma=0.1) 149 | elif args.lr_scheduler == 'multisteps': 150 | scheduler = lr_scheduler.MultiStepLR(optimizer, args.lr_steps, gamma=0.1) 151 | elif args.lr_scheduler == 'cosine': 152 | scheduler = lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0) 153 | elif args.lr_scheduler == 'plateau': 154 | scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True) 155 | 156 | best_top1 = 0.0 157 | tensorboard_logger.configure(os.path.join(log_folder)) 158 | # optionally resume from a checkpoint 159 | if args.resume: 160 | logfile = open(os.path.join(log_folder, 'log.log'), 'a') 161 | if os.path.isfile(args.resume): 162 | print("=> loading checkpoint '{}'".format(args.resume)) 163 | checkpoint = torch.load(args.resume) 164 | args.start_epoch = checkpoint['epoch'] 165 | best_top1 = checkpoint['best_top1'] 166 | model.load_state_dict(checkpoint['state_dict']) 167 | optimizer.load_state_dict(checkpoint['optimizer']) 168 | if args.lr_scheduler == 'plateau': 169 | scheduler.load_state_dict(checkpoint['scheduler']) 170 | print("=> loaded checkpoint '{}' (epoch {})" 171 | .format(args.resume, checkpoint['epoch'])) 172 | else: 173 | print("=> no checkpoint found at '{}'".format(args.resume)) 174 | else: 175 | if os.path.exists(os.path.join(log_folder, 'log.log')): 176 | shutil.copyfile(os.path.join(log_folder, 'log.log'), os.path.join( 177 | log_folder, 'log.log.{}'.format(int(time.time())))) 178 | logfile = open(os.path.join(log_folder, 'log.log'), 'w') 179 | 180 | print(args, flush=True) 181 | print(model, flush=True) 182 | 183 | print(args, file=logfile, flush=True) 184 | 185 | if args.resume is None: 186 | print(model, file=logfile, flush=True) 187 | 188 | for epoch 
in range(args.start_epoch, args.epochs): 189 | if args.lr_scheduler == 'plateau' and epoch > args.start_epoch: 190 | scheduler.step(val_losses) # step on the most recent validation loss 191 | elif args.lr_scheduler != 'plateau': 192 | scheduler.step(epoch) 193 | try: 194 | # all parameter groups are assumed to share the same learning rate 195 | lr = scheduler.optimizer.param_groups[0]['lr'] 196 | except Exception: 197 | lr = None 198 | # set current learning rate 199 | # train for one epoch 200 | train_top1, train_top5, train_losses, train_speed, speed_data_loader, train_steps = \ 201 | train(train_loader, model, train_criterion, optimizer, epoch + 1, display=args.print_freq) 202 | print( 203 | 'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format( 204 | epoch + 1, args.epochs, train_losses, train_top1, train_top5, train_speed * 1000.0, 205 | speed_data_loader * 1000.0), file=logfile, flush=True) 206 | print( 207 | 'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format( 208 | epoch + 1, args.epochs, train_losses, train_top1, train_top5, train_speed * 1000.0, 209 | speed_data_loader * 1000.0), flush=True) 210 | 211 | # evaluate on validation set 212 | val_top1, val_top5, val_losses, val_speed = validate(val_loader, model, val_criterion) 213 | print( 214 | 'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'.format( 215 | epoch + 1, args.epochs, val_losses, val_top1, val_top5, val_speed * 1000.0), 216 | file=logfile, flush=True) 217 | print( 218 | 'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'.format( 219 | epoch + 1, args.epochs, val_losses, val_top1, val_top5, val_speed * 1000.0), 220 | flush=True) 221 | # remember best prec@1 and save checkpoint 222 | is_best = val_top1 > best_top1 223 | best_top1 = max(val_top1, best_top1) 224 | 225 | save_dict = {'epoch': epoch + 1, 226 | 'arch': arch_name, 227 | 'state_dict': model.state_dict(), 228 | 'best_top1': best_top1, 229 | 'optimizer': optimizer.state_dict(), 230 | } 231 | if args.lr_scheduler == 'plateau': 232 | save_dict['scheduler'] = scheduler.state_dict() 233 | 234 | save_checkpoint(save_dict, is_best, filepath=log_folder) 235 | 236 | if lr is not None: 237 | tensorboard_logger.log_value('learning-rate', lr, epoch + 1) 238 | tensorboard_logger.log_value('val-top1', val_top1, epoch + 1) 239 | tensorboard_logger.log_value('val-loss', val_losses, epoch + 1) 240 | tensorboard_logger.log_value('train-top1', train_top1, epoch + 1) 241 | tensorboard_logger.log_value('train-loss', train_losses, epoch + 1) 242 | tensorboard_logger.log_value('best-val-top1', best_top1, epoch + 1) 243 | 244 | logfile.close() 245 | 246 | 247 | if __name__ == '__main__': 248 | main() 249 | --------------------------------------------------------------------------------
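
Putting the pieces above together: opts.py defines the command-line interface, core/video_utils.py builds the model, augmentation pipeline and data loaders, and train.py / test.py wire them to VideoDataSet. Below is a minimal evaluation sketch assembled only from functions that already exist in this repository; it mirrors what train.py does under --evaluate with the st2stv2 defaults (174 classes, '{:05d}.jpg' frame template, space-separated list file). The data directory, the 'validation_256.txt' list name, the single-GPU assumption (gpu_id=0) and the absence of checkpoint loading are assumptions of the sketch, not project requirements.

# Minimal sketch (not part of the repository): evaluate a bLVNet-TAM model on a
# validation list, reusing the helpers defined above. Paths are placeholders and
# at least one GPU is assumed; checkpoint loading is intentionally omitted.
import os

import torch.nn as nn

from core.video_utils import build_model, get_augmentor, build_dataflow, validate
from core.video_dataset import VideoDataSet
from opts import arg_parser

args = arg_parser().parse_args()
args.num_classes = 174                                    # st2stv1/v2 have 174 classes
args.input_channels = 3 if args.modality == 'rgb' else 2 * 5

model, arch_name = build_model(args, test_mode=True)
model = nn.DataParallel(model).cuda()

# Same ImageNet statistics that train.py and test.py pass explicitly.
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
augmentor = get_augmentor(False, args.input_shape, mean=mean, std=std,
                          disable_scaleup=args.disable_scaleup,
                          is_flow=(args.modality == 'flow'))

# List file as written by tools/extract_videos_st2st_v*.py (placeholder location).
val_list = os.path.join(args.datadir, 'validation_256.txt')
val_dataset = VideoDataSet("", val_list, args.groups, args.frames_per_group,
                           num_clips=args.num_clips, modality=args.modality,
                           image_tmpl='{:05d}.jpg', dense_sampling=args.dense_sampling,
                           transform=augmentor, is_train=False, test_mode=False,
                           seperator=" ", filter_video=3, num_classes=args.num_classes)
val_loader = build_dataflow(val_dataset, is_train=False,
                            batch_size=args.batch_size, workers=args.workers)

val_top1, val_top5, val_loss, _ = validate(val_loader, model,
                                           nn.CrossEntropyLoss().cuda(), gpu_id=0)
print('Top@1: {:.4f}\tTop@5: {:.4f}\tLoss: {:.4f}'.format(val_top1, val_top5, val_loss))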