├── .gitignore ├── .gitmodules ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── annotator └── README.md ├── checkpoints ├── detector │ └── .gitkeep ├── har │ ├── audioonly_64x1x1.py │ ├── slowonly_u54_kinetics.py │ └── timesformer_divST_16x12x1_kinetics.py └── pose │ └── .gitkeep ├── configs ├── audio │ ├── audioonly_r101_64x1x1_200e_audio_feature.py │ ├── tsn_r18_64x1x1_100e_kinetics200_audio_feature.py │ └── tsn_r50_64x1x1_100e_kinetics400_audio.py ├── i3d │ └── i3d_r50_video_32x2x1_256e_kinetics400_rgb.py ├── omnisourced │ └── slowonly_r50_8x8x1_256e_omnisource_rgb.py ├── skeleton │ ├── agcn │ │ └── 2sagcn_640e_p300_keypoint_2d.py │ └── posec3d │ │ └── slowonly_r50_u54_640e_pr-kinetics.py ├── slowfast │ └── slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py ├── slowonly │ └── slowonly_nl_embedded_gaussian_r50_8x8x1_150e.py └── timesformer │ └── timesformer_divST_16x12x1_15e_kinetics400_rgb.py ├── docker └── Dockerfile ├── requirements ├── extra.txt └── requirements.txt ├── resources ├── ann_dist_clip.jpg ├── ann_dist_clips.json ├── annotation_distribution(min).json ├── annotation_distribution.jpg ├── annotations │ ├── annotations.txt │ ├── annotations_audio.txt │ ├── annotations_pose.txt │ ├── current_annotations.txt │ └── temp.txt ├── audio │ ├── db_20_config.yml │ └── db_30_config.yml └── metrics │ ├── audio_cm.png │ ├── audio_loss.jpg │ ├── posec3d_loss.jpg │ ├── skeleton_cm.png │ └── timesformer_loss.jpg └── src ├── __int__.py ├── analysis ├── __int__.py ├── audio_filter.py ├── class_distribution_clips.py ├── class_distribution_time.py ├── evaluate_acc_per_cls.py ├── pose_feasibility.py └── print_layers.py ├── data ├── README.md ├── __int__.py ├── augment_dataset.py ├── build_file_list.py ├── generate_dataset.py ├── generate_dataset_pose.py └── pose_extraction.py ├── demo ├── __int__.py ├── demo_audio.py ├── demo_skeleton.py ├── long_video_demo_clips.py ├── multimodial_demo.py └── visualize_heatmap_volume.py ├── late_fusion.py ├── misc.py ├── record_experiment.py ├── schedule_stuff.py ├── top_tags.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode 132 | .vscode/ 133 | 134 | # data 135 | dataset/ 136 | dataset_2/ 137 | mmaction2/data/ 138 | mmaction2/work_dir/ 139 | mlruns/ 140 | demos/ 141 | annotator/via-3.0.11/ 142 | annotator/via_video_annotator.html 143 | checkpoints/har/audio.pth 144 | checkpoints/har/posec3d.pth 145 | checkpoints/har/timeSformer.pth 146 | checkpoints/pose/hrnet_w32_coco_256x192.pth 147 | checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth 148 | temp/ 149 | tmp/ 150 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mmaction2"] 2 | path = mmaction2 3 | url = https://github.com/open-mmlab/mmaction2.git 4 | [submodule "mmdetection"] 5 | path = mmdetection 6 | url = https://github.com/open-mmlab/mmdetection.git 7 | [submodule "mmpose"] 8 | path = mmpose 9 | url = https://github.com/open-mmlab/mmpose.git 10 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party = cv2,data,decord,demo,har,mlflow,mmaction,mmcv,moviepy,numpy,pandas,pyloudnorm,rich,schedule,scipy,seaborn,soundfile,torch,tqdm,utils,vidaug,yaml 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^tests/data/ 2 | repos: 3 | - repo: https://gitlab.com/pycqa/flake8.git 4 | rev: 3.8.3 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/asottile/seed-isort-config 8 | rev: v2.2.0 9 | hooks: 10 | - 
id: seed-isort-config 11 | - repo: https://github.com/timothycrosley/isort 12 | rev: 4.3.21 13 | hooks: 14 | - id: isort 15 | - repo: https://github.com/pre-commit/mirrors-yapf 16 | rev: v0.30.0 17 | hooks: 18 | - id: yapf 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.1.0 21 | hooks: 22 | - id: trailing-whitespace 23 | - id: check-yaml 24 | - id: end-of-file-fixer 25 | - id: requirements-txt-fixer 26 | - id: double-quote-string-fixer 27 | - id: check-merge-conflict 28 | - id: fix-encoding-pragma 29 | args: ["--remove"] 30 | - id: mixed-line-ending 31 | args: ["--fix=lf"] 32 | - repo: https://github.com/markdownlint/markdownlint 33 | rev: v0.11.0 34 | hooks: 35 | - id: markdownlint 36 | args: [ "-r", "~MD002,~MD013,~MD024,~MD029,~MD033,~MD034,~MD036" ] 37 | - repo: https://github.com/myint/docformatter 38 | rev: v1.3.1 39 | hooks: 40 | - id: docformatter 41 | args: ["--in-place", "--wrap-descriptions", "79"] 42 | - repo: https://github.com/codespell-project/codespell 43 | rev: v2.1.0 44 | hooks: 45 | - id: codespell 46 | args: ["--skip", "*.ipynb,tools/data/hvu/label_map.json", "-L", "te,nd,thre,Gool,gool"] 47 | # - repo: https://github.com/open-mmlab/pre-commit-hooks 48 | # rev: v0.1.0 # Use the ref you want to point at 49 | # hooks: 50 | # - id: check-algo-readme 51 | # - id: check-copyright 52 | # args: ["mmaction", "tools", "tests"] # these directories will be checked 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /annotator/README.md: -------------------------------------------------------------------------------- 1 | # VIA 2 | 3 | Simple but powerful annotator tool. Check it out, [link](https://www.robots.ox.ac.uk/~vgg/software/via/). 
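VIA 3 exports temporal video annotations as a single project JSON. As a rough illustration of how such an export can be flattened into `(video, label, start, end)` rows before building annotation files like those under `resources/annotations/`, here is a minimal sketch; the JSON keys (`file`, `metadata`, `z`, `av`, `fname`) and the `project.json` filename are assumptions about the VIA 3 export format, not this repo's actual conversion code (see `src/data/` for that).

```python
# Minimal sketch (assumption): flatten a VIA 3 video-project export into
# (video, label, start, end) rows. Key names and 'project.json' are illustrative.
import json


def load_via_segments(export_path='project.json'):
    with open(export_path) as f:
        project = json.load(f)

    # 'file' maps file ids to metadata such as the original filename (assumed schema)
    files = {fid: info['fname'] for fid, info in project.get('file', {}).items()}
    segments = []
    for entry in project.get('metadata', {}).values():
        # temporal segments carry a two-element 'z' = [start_sec, end_sec]
        if len(entry.get('z', [])) != 2:
            continue
        start, end = entry['z']
        # take the first attribute value as the class label
        label = next(iter(entry.get('av', {}).values()), None)
        # assumes view ids map one-to-one to file ids, as in simple projects
        video = files.get(entry.get('vid'), entry.get('vid'))
        segments.append((video, label, float(start), float(end)))
    return segments


if __name__ == '__main__':
    for video, label, start, end in load_via_segments():
        print(f'{video}\t{label}\t{start:.2f}\t{end:.2f}')
```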
4 | -------------------------------------------------------------------------------- /checkpoints/detector/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/checkpoints/detector/.gitkeep -------------------------------------------------------------------------------- /checkpoints/har/audioonly_64x1x1.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'AudioFeatureDataset' 2 | data_root = 'phar/mmaction2/data/phar/audio_feature/filtered_20/' 3 | data_root_val = 'mmaction2/data/phar/audio_feature/filtered_20/' 4 | data_root_test = 'mmaction2/data/phar/audio_feature/filtered_20/' 5 | ann_file_train = 'mmaction2/data/phar/audio_feature/filtered_20//train.txt' 6 | ann_file_val = 'mmaction2/data/phar/audio_feature/filtered_20//val.txt' 7 | ann_file_test = 'mmaction2/data/phar/audio_feature/filtered_20//val.txt' 8 | num_classes = 4 9 | model = dict(type='AudioRecognizer', 10 | backbone=dict(type='ResNetAudio', 11 | depth=101, 12 | pretrained=None, 13 | in_channels=1, 14 | norm_eval=False), 15 | cls_head=dict(type='AudioTSNHead', 16 | num_classes=4, 17 | in_channels=1024, 18 | dropout_ratio=0.5, 19 | init_std=0.01), 20 | train_cfg=None, 21 | test_cfg=dict(average_clips='prob')) 22 | train_pipeline = [ 23 | dict(type='LoadAudioFeature'), 24 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 25 | dict(type='AudioFeatureSelector'), 26 | dict(type='FormatAudioShape', input_format='NCTF'), 27 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 28 | dict(type='ToTensor', keys=['audios']) 29 | ] 30 | val_pipeline = [ 31 | dict(type='LoadAudioFeature'), 32 | dict(type='SampleFrames', 33 | clip_len=64, 34 | frame_interval=1, 35 | num_clips=1, 36 | test_mode=True), 37 | dict(type='AudioFeatureSelector'), 38 | dict(type='FormatAudioShape', input_format='NCTF'), 39 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 40 | dict(type='ToTensor', keys=['audios']) 41 | ] 42 | test_pipeline = [ 43 | dict(type='LoadAudioFeature'), 44 | dict(type='SampleFrames', 45 | clip_len=64, 46 | frame_interval=1, 47 | num_clips=10, 48 | test_mode=True), 49 | dict(type='AudioFeatureSelector'), 50 | dict(type='FormatAudioShape', input_format='NCTF'), 51 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 52 | dict(type='ToTensor', keys=['audios']) 53 | ] 54 | data = dict(videos_per_gpu=16, 55 | workers_per_gpu=1, 56 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 57 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 58 | train=dict(type='AudioFeatureDataset', 59 | ann_file='train.txt', 60 | data_prefix='', 61 | pipeline=[ 62 | dict(type='LoadAudioFeature'), 63 | dict(type='SampleFrames', 64 | clip_len=64, 65 | frame_interval=1, 66 | num_clips=1), 67 | dict(type='AudioFeatureSelector'), 68 | dict(type='FormatAudioShape', input_format='NCTF'), 69 | dict(type='Collect', 70 | keys=['audios', 'label'], 71 | meta_keys=[]), 72 | dict(type='ToTensor', keys=['audios']) 73 | ]), 74 | val=dict(type='AudioFeatureDataset', 75 | ann_file='val.txt', 76 | data_prefix='', 77 | pipeline=[ 78 | dict(type='LoadAudioFeature'), 79 | dict(type='SampleFrames', 80 | clip_len=64, 81 | frame_interval=1, 82 | num_clips=1, 83 | test_mode=True), 84 | dict(type='AudioFeatureSelector'), 85 | dict(type='FormatAudioShape', input_format='NCTF'), 86 | dict(type='Collect', 87 | keys=['audios', 'label'], 88 | 
meta_keys=[]), 89 | dict(type='ToTensor', keys=['audios']) 90 | ]), 91 | test=dict(type='AudioFeatureDataset', 92 | ann_file='val.txt', 93 | data_prefix='', 94 | pipeline=[ 95 | dict(type='LoadAudioFeature'), 96 | dict(type='SampleFrames', 97 | clip_len=64, 98 | frame_interval=1, 99 | num_clips=10, 100 | test_mode=True), 101 | dict(type='AudioFeatureSelector'), 102 | dict(type='FormatAudioShape', input_format='NCTF'), 103 | dict(type='Collect', 104 | keys=['audios', 'label'], 105 | meta_keys=[]), 106 | dict(type='ToTensor', keys=['audios']) 107 | ])) 108 | evaluation = dict(interval=5, 109 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 110 | 5)))) 111 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 112 | 5)))) 113 | optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001) 114 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 115 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 116 | total_epochs = 240 117 | checkpoint_config = dict(interval=20) 118 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 119 | dist_params = dict(backend='nccl') 120 | log_level = 'INFO' 121 | load_from = None 122 | resume_from = None 123 | workflow = [('train', 1)] 124 | opencv_num_threads = 0 125 | mp_start_method = 'fork' 126 | work_dir = 'mmaction2/work_dir/audio/' 127 | gpu_ids = range(0, 1) 128 | omnisource = False 129 | module_hooks = [] 130 | -------------------------------------------------------------------------------- /checkpoints/har/slowonly_u54_kinetics.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'PoseDataset' 2 | data_root = '/mmaction2/data/' 3 | data_root_val = '/mmaction2/data/' 4 | data_root_test = '/mmaction2/data/' 5 | ann_file_train = '/mmaction2/data/train.pkl' 6 | ann_file_val = '/mmaction2/data/val.pkl' 7 | ann_file_test = '/mmaction2/data/val.pkl' 8 | num_classes = 6 9 | left_kp = [1, 3, 5, 7, 9, 11, 13, 15] 10 | right_kp = [2, 4, 6, 8, 10, 12, 14, 16] 11 | model = dict(type='Recognizer3D', 12 | backbone=dict(type='ResNet3dSlowOnly', 13 | depth=50, 14 | pretrained=None, 15 | in_channels=17, 16 | base_channels=32, 17 | num_stages=3, 18 | out_indices=(2, ), 19 | stage_blocks=(4, 6, 3), 20 | conv1_stride_s=1, 21 | pool1_stride_s=1, 22 | inflate=(0, 1, 1), 23 | spatial_strides=(2, 2, 2), 24 | temporal_strides=(1, 1, 2), 25 | dilations=(1, 1, 1)), 26 | cls_head=dict(type='I3DHead', 27 | in_channels=512, 28 | num_classes=6, 29 | spatial_type='avg', 30 | dropout_ratio=0.7), 31 | train_cfg=dict(), 32 | test_cfg=dict(average_clips='prob')) 33 | train_pipeline = [ 34 | dict(type='UniformSampleFrames', clip_len=54), 35 | dict(type='PoseDecode'), 36 | dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 37 | dict(type='Resize', scale=(-1, 64)), 38 | dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), 39 | dict(type='Resize', scale=(56, 56), keep_ratio=False), 40 | dict(type='Flip', 41 | flip_ratio=0.5, 42 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 43 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 44 | dict(type='GeneratePoseTarget', 45 | sigma=0.6, 46 | use_score=True, 47 | with_kp=True, 48 | with_limb=False), 49 | dict(type='FormatShape', input_format='NCTHW'), 50 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 51 | dict(type='ToTensor', keys=['imgs', 'label']) 52 | ] 53 | val_pipeline = [ 54 | dict(type='UniformSampleFrames', clip_len=54, num_clips=1, test_mode=True), 55 | dict(type='PoseDecode'), 56 | 
dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 57 | dict(type='Resize', scale=(-1, 64)), 58 | dict(type='CenterCrop', crop_size=64), 59 | dict(type='GeneratePoseTarget', 60 | sigma=0.6, 61 | use_score=True, 62 | with_kp=True, 63 | with_limb=False), 64 | dict(type='FormatShape', input_format='NCTHW'), 65 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 66 | dict(type='ToTensor', keys=['imgs']) 67 | ] 68 | test_pipeline = [ 69 | dict(type='UniformSampleFrames', clip_len=54, num_clips=10, 70 | test_mode=True), 71 | dict(type='PoseDecode'), 72 | dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 73 | dict(type='Resize', scale=(-1, 64)), 74 | dict(type='CenterCrop', crop_size=64), 75 | dict(type='GeneratePoseTarget', 76 | sigma=0.6, 77 | use_score=True, 78 | with_kp=True, 79 | with_limb=False, 80 | double=True, 81 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 82 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 83 | dict(type='FormatShape', input_format='NCTHW'), 84 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 85 | dict(type='ToTensor', keys=['imgs']) 86 | ] 87 | data = dict(videos_per_gpu=16, 88 | workers_per_gpu=2, 89 | test_dataloader=dict(videos_per_gpu=1), 90 | train=dict(type='PoseDataset', 91 | ann_file='/mmaction2/data//train.pkl', 92 | data_prefix='', 93 | pipeline=[ 94 | dict(type='UniformSampleFrames', clip_len=54), 95 | dict(type='PoseDecode'), 96 | dict(type='PoseCompact', 97 | hw_ratio=1.0, 98 | allow_imgpad=True), 99 | dict(type='Resize', scale=(-1, 64)), 100 | dict(type='RandomResizedCrop', 101 | area_range=(0.56, 1.0)), 102 | dict(type='Resize', 103 | scale=(56, 56), 104 | keep_ratio=False), 105 | dict(type='Flip', 106 | flip_ratio=0.5, 107 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 108 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 109 | dict(type='GeneratePoseTarget', 110 | sigma=0.6, 111 | use_score=True, 112 | with_kp=True, 113 | with_limb=False), 114 | dict(type='FormatShape', input_format='NCTHW'), 115 | dict(type='Collect', 116 | keys=['imgs', 'label'], 117 | meta_keys=[]), 118 | dict(type='ToTensor', keys=['imgs', 'label']) 119 | ]), 120 | val=dict(type='PoseDataset', 121 | ann_file='/mmaction2/data//val.pkl', 122 | data_prefix='', 123 | pipeline=[ 124 | dict(type='UniformSampleFrames', 125 | clip_len=54, 126 | num_clips=1, 127 | test_mode=True), 128 | dict(type='PoseDecode'), 129 | dict(type='PoseCompact', 130 | hw_ratio=1.0, 131 | allow_imgpad=True), 132 | dict(type='Resize', scale=(-1, 64)), 133 | dict(type='CenterCrop', crop_size=64), 134 | dict(type='GeneratePoseTarget', 135 | sigma=0.6, 136 | use_score=True, 137 | with_kp=True, 138 | with_limb=False), 139 | dict(type='FormatShape', input_format='NCTHW'), 140 | dict(type='Collect', 141 | keys=['imgs', 'label'], 142 | meta_keys=[]), 143 | dict(type='ToTensor', keys=['imgs']) 144 | ]), 145 | test=dict(type='PoseDataset', 146 | ann_file='/mmaction2/data//val.pkl', 147 | data_prefix='', 148 | pipeline=[ 149 | dict(type='UniformSampleFrames', 150 | clip_len=54, 151 | num_clips=10, 152 | test_mode=True), 153 | dict(type='PoseDecode'), 154 | dict(type='PoseCompact', 155 | hw_ratio=1.0, 156 | allow_imgpad=True), 157 | dict(type='Resize', scale=(-1, 64)), 158 | dict(type='CenterCrop', crop_size=64), 159 | dict(type='GeneratePoseTarget', 160 | sigma=0.6, 161 | use_score=True, 162 | with_kp=True, 163 | with_limb=False, 164 | double=True, 165 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 166 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 167 | dict(type='FormatShape', input_format='NCTHW'), 168 | 
dict(type='Collect', 169 | keys=['imgs', 'label'], 170 | meta_keys=[]), 171 | dict(type='ToTensor', keys=['imgs']) 172 | ])) 173 | optimizer = dict(type='SGD', lr=0.05, momentum=0.9, weight_decay=0.0003) 174 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 175 | lr_config = dict(policy='CosineAnnealing', by_epoch=False, min_lr=0) 176 | total_epochs = 480 177 | checkpoint_config = dict(interval=20) 178 | workflow = [('train', 10)] 179 | evaluation = dict(interval=5, 180 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 181 | topk=(1, 2, 3, 4, 5)) 182 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 183 | 5)))) 184 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 185 | dist_params = dict(backend='nccl') 186 | log_level = 'INFO' 187 | load_from = ('https://download.openmmlab.com/mmaction/skeleton/posec3d/' 188 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_' 189 | 'keypoint/slowonly_kinetics400_pretrained_r50_u48_120e_ucf101' 190 | '_split1_keypoint-cae8aa4a.pth') 191 | resume_from = None 192 | find_unused_parameters = False 193 | work_dir = 'work_dir/posec3d/' 194 | gpu_ids = range(0, 1) 195 | omnisource = False 196 | module_hooks = [] 197 | -------------------------------------------------------------------------------- /checkpoints/har/timesformer_divST_16x12x1_kinetics.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = 'mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[127.5, 127.5, 127.5], 11 | std=[127.5, 127.5, 127.5], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='TimeSformer', 19 | pretrained= # noqa: E251 20 | 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 21 | num_frames=16, 22 | img_size=224, 23 | patch_size=16, 24 | embed_dims=768, 25 | in_channels=3, 26 | dropout_ratio=0.2, 27 | transformer_layers=None, 28 | # divided attention is the best strategy 29 | attention_type='divided_space_time', 30 | norm_cfg=dict(type='LN', eps=1e-6)), 31 | cls_head=dict(type='TimeSformerHead', 32 | num_classes=num_classes, 33 | in_channels=768, 34 | topk=(1, 2, 3, 4, 5)), 35 | # model training and testing settings 36 | train_cfg=None, 37 | test_cfg=dict(average_clips='prob')) 38 | 39 | train_pipeline = [ 40 | dict(type='DecordInit'), 41 | # * frame_interval has been selected for 7s clips 42 | dict(type='SampleFrames', clip_len=16, frame_interval=12, num_clips=1), 43 | dict(type='DecordDecode'), 44 | dict(type='RandomRescale', scale_range=(256, 320)), 45 | dict(type='RandomCrop', size=224), 46 | dict(type='Flip', flip_ratio=0.5), 47 | dict(type='Normalize', **img_norm_cfg), 48 | dict(type='FormatShape', input_format='NCTHW'), 49 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 50 | dict(type='ToTensor', keys=['imgs', 'label']) 51 | ] 52 | val_pipeline = [ 53 | dict(type='DecordInit'), 54 | dict(type='SampleFrames', 55 | clip_len=16, 56 | frame_interval=12, 57 | num_clips=1, 58 | test_mode=True), 59 | dict(type='DecordDecode'), 60 | dict(type='Resize', scale=(-1, 256)), 61 | dict(type='CenterCrop', crop_size=224), 62 | 
dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=16, 71 | frame_interval=12, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 224)), 76 | dict(type='ThreeCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs', 'label']) 81 | ] 82 | data = dict(videos_per_gpu=1, 83 | workers_per_gpu=1, 84 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 85 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | train=dict(type=dataset_type, 87 | ann_file=ann_file_train, 88 | data_prefix='', 89 | pipeline=train_pipeline), 90 | val=dict(type=dataset_type, 91 | ann_file=ann_file_val, 92 | data_prefix='', 93 | pipeline=val_pipeline), 94 | test=dict(type=dataset_type, 95 | ann_file=ann_file_test, 96 | data_prefix='', 97 | pipeline=test_pipeline)) 98 | 99 | # set the top-k accuracy during validation 100 | evaluation = dict( 101 | interval=1, 102 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 103 | ) 104 | # set the top-k accuracy during testing 105 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 106 | 5))), ) 107 | 108 | # optimizer 109 | optimizer = dict(type='SGD', 110 | lr=0.0015625, 111 | momentum=0.9, 112 | paramwise_cfg=dict( 113 | custom_keys={ 114 | '.backbone.cls_token': dict(decay_mult=0.0), 115 | '.backbone.pos_embed': dict(decay_mult=0.0), 116 | '.backbone.time_embed': dict(decay_mult=0.0) 117 | }), 118 | weight_decay=1e-4, 119 | nesterov=True) # this lr is used for 8 gpus 120 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 121 | 122 | # learning policy 123 | lr_config = dict(policy='step', step=[5, 10]) 124 | total_epochs = 25 125 | 126 | # * runtime settings 127 | checkpoint_config = dict(interval=1) 128 | log_config = dict( 129 | interval=1000, 130 | hooks=[ 131 | dict(type='TextLoggerHook'), 132 | # dict(type='TensorboardLoggerHook'), 133 | ]) 134 | dist_params = dict(backend='nccl') 135 | log_level = 'INFO' 136 | load_from = ('https://download.openmmlab.com/mmaction/recognition/timesformer/' 137 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb/' 138 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb-3f8e5d03.pth') 139 | resume_from = None 140 | workflow = [('train', 1)] 141 | 142 | # disable opencv multithreading to avoid system being overloaded 143 | opencv_num_threads = 0 144 | # set multi-process start method as `fork` to speed up the training 145 | mp_start_method = 'fork' 146 | -------------------------------------------------------------------------------- /checkpoints/pose/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/checkpoints/pose/.gitkeep -------------------------------------------------------------------------------- /configs/audio/audioonly_r101_64x1x1_200e_audio_feature.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'AudioFeatureDataset' 3 | data_root = 
('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | 'audio_feature/filtered_30/') 5 | data_root_val = data_root 6 | data_root_test = data_root 7 | ann_file_train = f'{data_root}/train.txt' 8 | ann_file_val = f'{data_root_val}/val.txt' 9 | ann_file_test = f'{data_root_test}/val.txt' 10 | num_classes = 4 11 | 12 | # * model settings 13 | model = dict( 14 | type='AudioRecognizer', 15 | backbone=dict(type='ResNetAudio', 16 | depth=101, 17 | pretrained=None, 18 | in_channels=1, 19 | norm_eval=False), 20 | cls_head=dict( 21 | type='AudioTSNHead', 22 | num_classes=num_classes, 23 | in_channels=1024, 24 | dropout_ratio=0.5, # TODO: 0.6 - 0.8 25 | init_std=0.01, 26 | topk=(1, 2, 3, 4, 5)), 27 | # model training and testing settings 28 | train_cfg=None, 29 | test_cfg=dict(average_clips='prob')) 30 | 31 | train_pipeline = [ 32 | dict(type='LoadAudioFeature'), 33 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 34 | dict(type='AudioFeatureSelector'), 35 | dict(type='FormatAudioShape', input_format='NCTF'), 36 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 37 | dict(type='ToTensor', keys=['audios']) 38 | ] 39 | val_pipeline = [ 40 | dict(type='LoadAudioFeature'), 41 | dict(type='SampleFrames', 42 | clip_len=64, 43 | frame_interval=1, 44 | num_clips=1, 45 | test_mode=True), 46 | dict(type='AudioFeatureSelector'), 47 | dict(type='FormatAudioShape', input_format='NCTF'), 48 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 49 | dict(type='ToTensor', keys=['audios']) 50 | ] 51 | test_pipeline = [ 52 | dict(type='LoadAudioFeature'), 53 | dict(type='SampleFrames', 54 | clip_len=64, 55 | frame_interval=1, 56 | num_clips=10, 57 | test_mode=True), 58 | dict(type='AudioFeatureSelector'), 59 | dict(type='FormatAudioShape', input_format='NCTF'), 60 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 61 | dict(type='ToTensor', keys=['audios']) 62 | ] 63 | data = dict(videos_per_gpu=16, 64 | workers_per_gpu=1, 65 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 66 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 67 | train=dict(type=dataset_type, 68 | ann_file=ann_file_train, 69 | data_prefix='', 70 | pipeline=train_pipeline), 71 | val=dict(type=dataset_type, 72 | ann_file=ann_file_val, 73 | data_prefix='', 74 | pipeline=val_pipeline), 75 | test=dict(type=dataset_type, 76 | ann_file=ann_file_test, 77 | data_prefix='', 78 | pipeline=test_pipeline)) 79 | # set the top-k accuracy during validation 80 | evaluation = dict( 81 | interval=5, # Interval to perform evaluation 82 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 83 | ) 84 | # set the top-k accuracy during testing 85 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 86 | 5))), ) 87 | 88 | # optimizer 89 | optimizer = dict(type='SGD', lr=0.0025, momentum=0.9, 90 | weight_decay=0.0001) # this lr is used for 8 gpus 91 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 92 | # learning policy 93 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 94 | total_epochs = 320 95 | 96 | # * runtime settings 97 | checkpoint_config = dict(interval=20) 98 | log_config = dict( 99 | interval=100, 100 | hooks=[ 101 | dict(type='TextLoggerHook'), 102 | # dict(type='TensorboardLoggerHook'), 103 | ]) 104 | # runtime settings 105 | dist_params = dict(backend='nccl') 106 | log_level = 'INFO' 107 | load_from = None 108 | resume_from = None 109 | workflow = [('train', 1)] 110 | 111 | # disable opencv multithreading 
to avoid system being overloaded 112 | opencv_num_threads = 0 113 | # set multi-process start method as `fork` to speed up the training 114 | mp_start_method = 'fork' 115 | -------------------------------------------------------------------------------- /configs/audio/tsn_r18_64x1x1_100e_kinetics200_audio_feature.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'AudioFeatureDataset' 3 | data_root = ('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | 'audio_feature/filtered_20/') 5 | data_root_val = data_root 6 | data_root_test = data_root 7 | ann_file_train = f'{data_root}/train.txt' 8 | ann_file_val = f'{data_root_val}/val.txt' 9 | ann_file_test = f'{data_root_test}/test.txt' 10 | num_classes = 3 11 | 12 | # * model settings 13 | model = dict( 14 | type='AudioRecognizer', 15 | backbone=dict(type='ResNet', depth=18, in_channels=1, norm_eval=False), 16 | cls_head=dict(type='AudioTSNHead', 17 | num_classes=num_classes, 18 | in_channels=512, 19 | dropout_ratio=0.7, 20 | init_std=0.01, 21 | topk=(1, 2, 3, 4, 5)), 22 | # model training and testing settings 23 | train_cfg=None, 24 | test_cfg=dict(average_clips='prob')) 25 | 26 | train_pipeline = [ 27 | dict(type='LoadAudioFeature'), 28 | dict(type='SampleFrames', clip_len=64, frame_interval=2, num_clips=1), 29 | dict(type='AudioFeatureSelector'), 30 | dict(type='FormatAudioShape', input_format='NCTF'), 31 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 32 | dict(type='ToTensor', keys=['audios']) 33 | ] 34 | val_pipeline = [ 35 | dict(type='LoadAudioFeature'), 36 | dict(type='SampleFrames', 37 | clip_len=64, 38 | frame_interval=2, 39 | num_clips=1, 40 | test_mode=True), 41 | dict(type='AudioFeatureSelector'), 42 | dict(type='FormatAudioShape', input_format='NCTF'), 43 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 44 | dict(type='ToTensor', keys=['audios']) 45 | ] 46 | test_pipeline = [ 47 | dict(type='LoadAudioFeature'), 48 | dict(type='SampleFrames', 49 | clip_len=64, 50 | frame_interval=2, 51 | num_clips=1, 52 | test_mode=True), 53 | dict(type='AudioFeatureSelector'), 54 | dict(type='FormatAudioShape', input_format='NCTF'), 55 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 56 | dict(type='ToTensor', keys=['audios']) 57 | ] 58 | data = dict(videos_per_gpu=32, 59 | workers_per_gpu=2, 60 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 61 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 62 | train=dict(type=dataset_type, 63 | ann_file=ann_file_train, 64 | data_prefix='', 65 | pipeline=train_pipeline), 66 | val=dict(type=dataset_type, 67 | ann_file=ann_file_val, 68 | data_prefix='', 69 | pipeline=val_pipeline), 70 | test=dict(type=dataset_type, 71 | ann_file=ann_file_test, 72 | data_prefix='', 73 | pipeline=test_pipeline)) 74 | # set the top-k accuracy during validation 75 | evaluation = dict( 76 | interval=5, # Interval to perform evaluation 77 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 78 | ) 79 | # set the top-k accuracy during testing 80 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 81 | 5))), ) 82 | 83 | # optimizer 84 | optimizer = dict(type='SGD', lr=0.1, momentum=0.9, 85 | weight_decay=0.0001) # this lr is used for 8 gpus 86 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 87 | # learning policy 88 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 89 | total_epochs = 200 90 | 91 | # * runtime 
settings 92 | checkpoint_config = dict(interval=20) 93 | log_config = dict( 94 | interval=20, 95 | hooks=[ 96 | dict(type='TextLoggerHook'), 97 | # dict(type='TensorboardLoggerHook'), 98 | ]) 99 | # runtime settings 100 | dist_params = dict(backend='nccl') 101 | log_level = 'INFO' 102 | load_from = ( 103 | 'https://download.openmmlab.com/mmaction/recognition/' 104 | 'audio_recognition/tsn_r18_64x1x1_100e_kinetics400_audio_feature/' 105 | 'tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth') 106 | # load_from=None 107 | resume_from = None 108 | workflow = [('train', 1)] 109 | 110 | # disable opencv multithreading to avoid system being overloaded 111 | opencv_num_threads = 0 112 | # set multi-process start method as `fork` to speed up the training 113 | mp_start_method = 'fork' 114 | -------------------------------------------------------------------------------- /configs/audio/tsn_r50_64x1x1_100e_kinetics400_audio.py: -------------------------------------------------------------------------------- 1 | 2 | # * dataset settings 3 | dataset_type = 'AudioFeatureDataset' 4 | data_root = ('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 5 | 'audio/') 6 | data_root_val = data_root 7 | data_root_test = data_root 8 | ann_file_train = f'{data_root}/train.txt' 9 | ann_file_val = f'{data_root_val}/val.txt' 10 | ann_file_test = f'{data_root_test}/test.txt' 11 | num_classes = 4 12 | 13 | # * model settings 14 | model = dict( 15 | type='AudioRecognizer', 16 | backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False), 17 | cls_head=dict( 18 | type='AudioTSNHead', 19 | num_classes=num_classes, 20 | in_channels=2048, 21 | dropout_ratio=0.5, 22 | init_std=0.01), 23 | # model training and testing settings 24 | train_cfg=None, 25 | test_cfg=dict(average_clips='prob')) 26 | 27 | 28 | train_pipeline = [ 29 | dict(type='AudioDecodeInit'), 30 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 31 | dict(type='AudioDecode'), 32 | dict(type='AudioAmplify', ratio=1.5), 33 | # dict(type='MelLogSpectrogram'), 34 | dict(type='FormatAudioShape', input_format='NCTF'), 35 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 36 | dict(type='ToTensor', keys=['audios']) 37 | ] 38 | val_pipeline = [ 39 | dict(type='AudioDecodeInit'), 40 | dict( 41 | type='SampleFrames', 42 | clip_len=64, 43 | frame_interval=1, 44 | num_clips=1, 45 | test_mode=True), 46 | dict(type='AudioDecode'), 47 | dict(type='AudioAmplify', ratio=1.5), 48 | # dict(type='MelLogSpectrogram'), 49 | dict(type='FormatAudioShape', input_format='NCTF'), 50 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 51 | dict(type='ToTensor', keys=['audios']) 52 | ] 53 | test_pipeline = [ 54 | dict(type='AudioDecodeInit'), 55 | dict( 56 | type='SampleFrames', 57 | clip_len=64, 58 | frame_interval=1, 59 | num_clips=1, 60 | test_mode=True), 61 | dict(type='AudioDecodeInit'), 62 | dict(type='AudioAmplify', ratio=1.5), 63 | # dict(type='MelLogSpectrogram'), 64 | dict(type='FormatAudioShape', input_format='NCTF'), 65 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 66 | dict(type='ToTensor', keys=['audios']) 67 | ] 68 | data = dict( 69 | videos_per_gpu=32, 70 | workers_per_gpu=1, 71 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 72 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 73 | train=dict( 74 | type=dataset_type, 75 | ann_file=ann_file_train, 76 | data_prefix='', 77 | pipeline=train_pipeline), 78 | val=dict( 79 | type=dataset_type, 80 | 
ann_file=ann_file_val, 81 | data_prefix='', 82 | pipeline=val_pipeline), 83 | test=dict( 84 | type=dataset_type, 85 | ann_file=ann_file_test, 86 | data_prefix='', 87 | pipeline=test_pipeline)) 88 | # set the top-k accuracy during validation 89 | evaluation = dict( 90 | interval=5, # Interval to perform evaluation 91 | metric_options=dict( 92 | top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))),) 93 | # set the top-k accuracy during testing 94 | eval_config = dict( 95 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))),) 96 | 97 | # optimizer 98 | optimizer = dict( 99 | type='SGD', lr=0.05, momentum=0.9, 100 | weight_decay=0.0001) # this lr is used for 8 gpus 101 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 102 | # learning policy 103 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 104 | total_epochs = 540 105 | 106 | # * runtime settings 107 | checkpoint_config = dict(interval=10) 108 | log_config = dict( 109 | interval=20, 110 | hooks=[ 111 | dict(type='TextLoggerHook'), 112 | # dict(type='TensorboardLoggerHook'), 113 | ]) 114 | # runtime settings 115 | dist_params = dict(backend='nccl') 116 | log_level = 'INFO' 117 | load_from = None 118 | resume_from = None 119 | workflow = [('train', 1)] 120 | 121 | # disable opencv multithreading to avoid system being overloaded 122 | opencv_num_threads = 0 123 | # set multi-process start method as `fork` to speed up the training 124 | mp_start_method = 'fork' 125 | -------------------------------------------------------------------------------- /configs/i3d/i3d_r50_video_32x2x1_256e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | 11 | # * model settings 12 | model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3d', 15 | pretrained2d=True, 16 | pretrained='torchvision://resnet50', 17 | depth=50, 18 | conv1_kernel=(5, 7, 7), 19 | conv1_stride_t=2, 20 | pool1_stride_t=2, 21 | conv_cfg=dict(type='Conv3d'), 22 | norm_eval=False, 23 | inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 24 | 0)), 25 | zero_init_residual=False), 26 | cls_head=dict(type='I3DHead', 27 | num_classes=num_classes, 28 | in_channels=2048, 29 | spatial_type='avg', 30 | dropout_ratio=0.8, 31 | init_std=0.01, 32 | topk=(1, 2, 3, 4, 5)), 33 | # model training and testing settings 34 | train_cfg=None, 35 | test_cfg=dict(average_clips='prob')) 36 | 37 | # This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332 # noqa: E501 38 | 39 | # * dataset settings 40 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 41 | std=[58.395, 57.12, 57.375], 42 | to_bgr=False) 43 | train_pipeline = [ 44 | dict(type='DecordInit'), 45 | dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), 46 | dict(type='DecordDecode'), 47 | dict(type='Resize', scale=(-1, 256)), 48 | dict(type='MultiScaleCrop', 49 | input_size=224, 50 | scales=(1, 0.8), 51 | random_crop=False, 52 | max_wh_scale_gap=0), 53 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 54 | dict(type='Flip', flip_ratio=0.5), 55 | dict(type='Normalize', **img_norm_cfg), 56 | 
dict(type='FormatShape', input_format='NCTHW'), 57 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 58 | dict(type='ToTensor', keys=['imgs', 'label']) 59 | ] 60 | val_pipeline = [ 61 | dict(type='DecordInit'), 62 | dict(type='SampleFrames', 63 | clip_len=32, 64 | frame_interval=2, 65 | num_clips=1, 66 | test_mode=True), 67 | dict(type='DecordDecode'), 68 | dict(type='Resize', scale=(-1, 256)), 69 | dict(type='CenterCrop', crop_size=224), 70 | dict(type='Normalize', **img_norm_cfg), 71 | dict(type='FormatShape', input_format='NCTHW'), 72 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 73 | dict(type='ToTensor', keys=['imgs']) 74 | ] 75 | test_pipeline = [ 76 | dict(type='DecordInit'), 77 | dict(type='SampleFrames', 78 | clip_len=32, 79 | frame_interval=2, 80 | num_clips=8, 81 | test_mode=True), 82 | dict(type='DecordDecode'), 83 | dict(type='Resize', scale=(-1, 256)), 84 | dict(type='ThreeCrop', crop_size=256), 85 | dict(type='Normalize', **img_norm_cfg), 86 | dict(type='FormatShape', input_format='NCTHW'), 87 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 88 | dict(type='ToTensor', keys=['imgs']) 89 | ] 90 | 91 | data = dict(videos_per_gpu=4, 92 | workers_per_gpu=1, 93 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 94 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 95 | train=dict( 96 | type=dataset_type, 97 | ann_file=ann_file_train, 98 | data_prefix='', 99 | pipeline=train_pipeline, 100 | ), 101 | val=dict( 102 | type=dataset_type, 103 | ann_file=ann_file_val, 104 | data_prefix='', 105 | pipeline=val_pipeline, 106 | ), 107 | test=dict( 108 | type=dataset_type, 109 | ann_file=ann_file_test, 110 | data_prefix='', 111 | pipeline=test_pipeline, 112 | )) 113 | # set the top-k accuracy during validation 114 | evaluation = dict( 115 | interval=5, # Interval to perform evaluation 116 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 117 | ) 118 | # set the top-k accuracy during testing 119 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 120 | 5))), ) 121 | 122 | # * optimizer 123 | optimizer = dict(type='SGD', lr=0.00625, momentum=0.9, weight_decay=0.0001) 124 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 125 | # learning policy 126 | lr_config = dict(policy='step', step=[40, 80]) 127 | total_epochs = 256 128 | 129 | # * runtime settings 130 | checkpoint_config = dict(interval=2) 131 | log_config = dict( 132 | interval=200, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook'), 136 | ]) 137 | dist_params = dict(backend='nccl') 138 | log_level = 'INFO' 139 | load_from = ('https://download.openmmlab.com/mmaction/' 140 | 'recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb/' 141 | 'i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth') 142 | resume_from = None 143 | workflow = [('train', 1)] 144 | -------------------------------------------------------------------------------- /configs/omnisourced/slowonly_r50_8x8x1_256e_omnisource_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/test.txt' 9 | num_classes = 18 10 | 11 | # * model settings 12 | 
model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3dSlowOnly', 15 | depth=50, 16 | pretrained=None, 17 | lateral=False, 18 | conv1_kernel=(1, 7, 7), 19 | conv1_stride_t=1, 20 | pool1_stride_t=1, 21 | inflate=(0, 0, 1, 1), 22 | norm_eval=False), 23 | cls_head=dict(type='I3DHead', 24 | in_channels=2048, 25 | num_classes=num_classes, 26 | spatial_type='avg', 27 | dropout_ratio=0.5, 28 | topk=(1, 2, 3, 4, 5)), 29 | # model training and testing settings 30 | train_cfg=None, 31 | test_cfg=dict(average_clips='prob')) 32 | 33 | # * dataset settings 34 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 35 | std=[58.395, 57.12, 57.375], 36 | to_bgr=False) 37 | train_pipeline = [ 38 | dict(type='DecordInit'), 39 | dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), 40 | dict(type='DecordDecode'), 41 | dict(type='Resize', scale=(-1, 256)), 42 | dict(type='RandomResizedCrop'), 43 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 44 | dict(type='Flip', flip_ratio=0.5), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='FormatShape', input_format='NCTHW'), 47 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 48 | dict(type='ToTensor', keys=['imgs', 'label']) 49 | ] 50 | 51 | val_pipeline = [ 52 | dict(type='DecordInit'), 53 | dict(type='SampleFrames', 54 | clip_len=8, 55 | frame_interval=8, 56 | num_clips=1, 57 | test_mode=True), 58 | dict(type='DecordDecode'), 59 | dict(type='Resize', scale=(-1, 256)), 60 | dict(type='CenterCrop', crop_size=256), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='FormatShape', input_format='NCTHW'), 63 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 64 | dict(type='ToTensor', keys=['imgs']) 65 | ] 66 | 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=8, 71 | frame_interval=8, 72 | num_clips=10, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 256)), 76 | dict(type='ThreeCrop', crop_size=256), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs']) 81 | ] 82 | 83 | data = dict(videos_per_gpu=6, 84 | workers_per_gpu=1, 85 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 87 | train=dict( 88 | type=dataset_type, 89 | ann_file=ann_file_train, 90 | data_prefix='', 91 | pipeline=train_pipeline, 92 | ), 93 | val=dict( 94 | type=dataset_type, 95 | ann_file=ann_file_val, 96 | data_prefix='', 97 | pipeline=val_pipeline, 98 | ), 99 | test=dict( 100 | type=dataset_type, 101 | ann_file=ann_file_test, 102 | data_prefix='', 103 | pipeline=test_pipeline, 104 | )) 105 | # set the top-k accuracy during validation 106 | evaluation = dict( 107 | interval=5, # Interval to perform evaluation 108 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 109 | ) 110 | # set the top-k accuracy during testing 111 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 112 | 5))), ) 113 | 114 | # * optimizer 115 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, 116 | weight_decay=0.0001) # this lr is used for 8 gpus 117 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 118 | # learning policy 119 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 120 | 121 | # The flag indicates using joint training 122 | # omnisource = True 123 | 124 | # 
* runtime settings 125 | checkpoint_config = dict(interval=4) 126 | log_config = dict( 127 | interval=20, 128 | hooks=[ 129 | dict(type='TextLoggerHook'), 130 | # dict(type='TensorboardLoggerHook'), 131 | ]) 132 | # runtime settings 133 | total_epochs = 256 134 | find_unused_parameters = False 135 | dist_params = dict(backend='nccl') 136 | log_level = 'INFO' 137 | load_from = ( 138 | 'https://download.openmmlab.com/mmaction/recognition/slowonly' 139 | '/omni/slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth') 140 | resume_from = None 141 | workflow = [('train', 1)] 142 | 143 | # disable opencv multithreading to avoid system being overloaded 144 | opencv_num_threads = 0 145 | # set multi-process start method as `fork` to speed up the training 146 | mp_start_method = 'fork' 147 | -------------------------------------------------------------------------------- /configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py: -------------------------------------------------------------------------------- 1 | model = dict(type='SkeletonGCN', 2 | backbone=dict(type='AGCN', 3 | in_channels=3, 4 | graph_cfg=dict(layout='coco', strategy='agcn'), 5 | dropout=0.2), 6 | cls_head=dict(type='STGCNHead', 7 | num_classes=6, 8 | in_channels=256, 9 | loss_cls=dict(type='CrossEntropyLoss'), 10 | topk=(1, 2, 3, 4, 5)), 11 | train_cfg=None, 12 | test_cfg=None) 13 | 14 | dataset_type = 'PoseDataset' 15 | ann_file_train = '/mmaction2/data/kinesphere_train.pkl' 16 | ann_file_val = '/mmaction2/data/kinesphere_val.pkl' 17 | ann_file_test = '/mmaction2/data/kinesphere_val.pkl' 18 | 19 | train_pipeline = [ 20 | dict(type='PaddingWithLoop', clip_len=450), 21 | dict(type='PoseDecode'), 22 | dict(type='FormatGCNInput', input_format='NCTVM'), 23 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 24 | dict(type='ToTensor', keys=['keypoint']) 25 | ] 26 | val_pipeline = [ 27 | dict(type='PaddingWithLoop', clip_len=450), 28 | dict(type='PoseDecode'), 29 | dict(type='FormatGCNInput', input_format='NCTVM'), 30 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 31 | dict(type='ToTensor', keys=['keypoint']) 32 | ] 33 | test_pipeline = [ 34 | dict(type='PaddingWithLoop', clip_len=450), 35 | dict(type='PoseDecode'), 36 | dict(type='FormatGCNInput', input_format='NCTVM'), 37 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 38 | dict(type='ToTensor', keys=['keypoint']) 39 | ] 40 | data = dict(videos_per_gpu=32, 41 | workers_per_gpu=2, 42 | test_dataloader=dict(videos_per_gpu=1), 43 | train=dict(type=dataset_type, 44 | ann_file=ann_file_train, 45 | data_prefix='', 46 | pipeline=train_pipeline), 47 | val=dict(type=dataset_type, 48 | ann_file=ann_file_val, 49 | data_prefix='', 50 | pipeline=val_pipeline), 51 | test=dict(type=dataset_type, 52 | ann_file=ann_file_val, 53 | data_prefix='', 54 | pipeline=test_pipeline)) 55 | 56 | # optimizer 57 | optimizer = dict(type='SGD', 58 | lr=0.1, 59 | momentum=0.9, 60 | weight_decay=0.0001, 61 | nesterov=True) 62 | optimizer_config = dict(grad_clip=None) 63 | # learning policy 64 | lr_config = dict(policy='step', step=[30, 40, 520]) 65 | total_epochs = 640 66 | checkpoint_config = dict(interval=40) 67 | evaluation = dict(interval=5, 68 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 69 | topk=(1, 2, 3, 4, 5)) 70 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 71 | 5))), ) 72 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 73 | 74 | # runtime settings 75 | dist_params = dict(backend='nccl') 
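# Usage (a minimal sketch, assuming the standard mmaction2 0.x `tools/` entry
# points from the pinned submodule; the checkpoint path `work_dirs/agcn/latest.pth`
# is only a placeholder, not something fixed by this repository):
#   python mmaction2/tools/train.py \
#       configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py --validate
#   python mmaction2/tools/test.py \
#       configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py \
#       work_dirs/agcn/latest.pth --eval top_k_accuracy mean_class_accuracy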
76 | log_level = 'INFO' 77 | load_from = None 78 | resume_from = None 79 | workflow = [('train', 1)] 80 | -------------------------------------------------------------------------------- /configs/skeleton/posec3d/slowonly_r50_u54_640e_pr-kinetics.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'PoseDataset' 3 | data_root = ('/home/jovyan/mmaction2/data') 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/kinesphere_train.pkl' 7 | ann_file_val = f'{data_root_val}/kinesphere_val.pkl' 8 | ann_file_test = f'{data_root_test}/kinesphere_val.pkl' 9 | num_classes = 6 10 | left_kp = [1, 3, 5, 7, 9, 11, 13, 15] 11 | right_kp = [2, 4, 6, 8, 10, 12, 14, 16] 12 | 13 | # model settings 14 | model = dict(type='Recognizer3D', 15 | backbone=dict(type='ResNet3dSlowOnly', 16 | depth=50, 17 | pretrained=None, 18 | in_channels=17, 19 | base_channels=32, 20 | num_stages=3, 21 | out_indices=(2, ), 22 | stage_blocks=(4, 6, 3), 23 | conv1_stride_s=1, 24 | pool1_stride_s=1, 25 | inflate=(0, 1, 1), 26 | spatial_strides=(2, 2, 2), 27 | temporal_strides=(1, 1, 2), 28 | dilations=(1, 1, 1)), 29 | cls_head=dict(type='I3DHead', 30 | in_channels=512, 31 | num_classes=num_classes, 32 | spatial_type='avg', 33 | dropout_ratio=0.7, 34 | topk=(1, 2, 3, 4, 5)), 35 | train_cfg=dict(), 36 | test_cfg=dict(average_clips='prob')) 37 | 38 | train_pipeline = [ 39 | # * 54 (25% of 210) sampled frames seems better 40 | # 48 frames = 22.8% 41 | dict(type='UniformSampleFrames', clip_len=54), 42 | dict(type='PoseDecode'), 43 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 44 | dict(type='Resize', scale=(-1, 64)), 45 | dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), 46 | dict(type='Resize', scale=(56, 56), keep_ratio=False), 47 | dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), 48 | dict(type='GeneratePoseTarget', 49 | sigma=0.6, 50 | use_score=True, 51 | with_kp=True, 52 | with_limb=False), 53 | dict(type='FormatShape', input_format='NCTHW'), 54 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 55 | dict(type='ToTensor', keys=['imgs', 'label']) 56 | ] 57 | val_pipeline = [ 58 | dict(type='UniformSampleFrames', clip_len=54, num_clips=1, test_mode=True), 59 | dict(type='PoseDecode'), 60 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 61 | dict(type='Resize', scale=(-1, 64)), 62 | dict(type='CenterCrop', crop_size=64), 63 | dict(type='GeneratePoseTarget', 64 | sigma=0.6, 65 | use_score=True, 66 | with_kp=True, 67 | with_limb=False), 68 | dict(type='FormatShape', input_format='NCTHW'), 69 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 70 | dict(type='ToTensor', keys=['imgs']) 71 | ] 72 | test_pipeline = [ 73 | dict(type='UniformSampleFrames', clip_len=54, num_clips=10, 74 | test_mode=True), 75 | dict(type='PoseDecode'), 76 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 77 | dict(type='Resize', scale=(-1, 64)), 78 | dict(type='CenterCrop', crop_size=64), 79 | dict(type='GeneratePoseTarget', 80 | sigma=0.6, 81 | use_score=True, 82 | with_kp=True, 83 | with_limb=False, 84 | double=True, 85 | left_kp=left_kp, 86 | right_kp=right_kp), 87 | dict(type='FormatShape', input_format='NCTHW'), 88 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 89 | dict(type='ToTensor', keys=['imgs']) 90 | ] 91 | data = dict(videos_per_gpu=12, 92 | workers_per_gpu=2, 93 | test_dataloader=dict(videos_per_gpu=1), 94 | 
train=dict(type=dataset_type, 95 | ann_file=ann_file_train, 96 | data_prefix='', 97 | pipeline=train_pipeline), 98 | val=dict(type=dataset_type, 99 | ann_file=ann_file_val, 100 | data_prefix='', 101 | pipeline=val_pipeline), 102 | test=dict(type=dataset_type, 103 | ann_file=ann_file_test, 104 | data_prefix='', 105 | pipeline=test_pipeline)) 106 | 107 | # optimizer 108 | optimizer = dict(type='SGD', lr=0.0375, momentum=0.9, 109 | weight_decay=0.0003) # this lr is used for 8 gpus 110 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 111 | 112 | # learning policy 113 | lr_config = dict(policy='CosineAnnealing', by_epoch=False, min_lr=0) 114 | total_epochs = 640 115 | checkpoint_config = dict(interval=40) 116 | workflow = [('train', 10)] 117 | evaluation = dict(interval=5, 118 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 119 | topk=(1, 2, 3, 4, 5)) 120 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 121 | 5))), ) 122 | log_config = dict(interval=20, hooks=[ 123 | dict(type='TextLoggerHook'), 124 | ]) 125 | 126 | dist_params = dict(backend='nccl') 127 | log_level = 'INFO' 128 | load_from = ( 129 | 'https://download.openmmlab.com/mmaction/skeleton/posec3d/' 130 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_keypoint/' 131 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_keypoint' 132 | '-cae8aa4a.pth') 133 | resume_from = None 134 | find_unused_parameters = False 135 | -------------------------------------------------------------------------------- /configs/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/test.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 11 | std=[58.395, 57.12, 57.375], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='ResNet3dSlowFast', 19 | pretrained=None, 20 | resample_rate=8, # tau 21 | speed_ratio=8, # alpha 22 | channel_ratio=8, # beta_inv 23 | slow_pathway=dict(type='resnet3d', 24 | depth=50, 25 | pretrained=None, 26 | lateral=True, 27 | conv1_kernel=(1, 7, 7), 28 | dilations=(1, 1, 1, 1), 29 | conv1_stride_t=1, 30 | pool1_stride_t=1, 31 | inflate=(0, 0, 1, 1), 32 | norm_eval=False), 33 | fast_pathway=dict(type='resnet3d', 34 | depth=50, 35 | pretrained=None, 36 | lateral=False, 37 | base_channels=8, 38 | conv1_kernel=(5, 7, 7), 39 | conv1_stride_t=1, 40 | pool1_stride_t=1, 41 | norm_eval=False)), 42 | cls_head=dict( 43 | type='SlowFastHead', 44 | in_channels=2304, # 2048+256 45 | num_classes=num_classes, 46 | spatial_type='avg', 47 | dropout_ratio=0.7, 48 | topk=(1, 2, 3, 4, 5)), 49 | # model training and testing settings 50 | train_cfg=None, 51 | test_cfg=dict(average_clips='prob')) 52 | 53 | # * pipelines 54 | train_pipeline = [ 55 | dict(type='DecordInit'), 56 | dict(type='SampleFrames', clip_len=24, frame_interval=2, num_clips=1), 57 | dict(type='DecordDecode'), 58 | dict(type='Resize', scale=(-1, 256)), 59 | dict(type='RandomResizedCrop'), 60 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 61 | dict(type='Flip', flip_ratio=0.5), 62 | 
dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | val_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=24, 71 | frame_interval=2, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 256)), 76 | dict(type='CenterCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs']) 81 | ] 82 | test_pipeline = [ 83 | dict(type='DecordInit'), 84 | dict(type='SampleFrames', 85 | clip_len=24, 86 | frame_interval=2, 87 | num_clips=10, 88 | test_mode=True), 89 | dict(type='DecordDecode'), 90 | dict(type='Resize', scale=(-1, 256)), 91 | dict(type='ThreeCrop', crop_size=256), 92 | dict(type='Normalize', **img_norm_cfg), 93 | dict(type='FormatShape', input_format='NCTHW'), 94 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 95 | dict(type='ToTensor', keys=['imgs']) 96 | ] 97 | data = dict(videos_per_gpu=8, 98 | workers_per_gpu=1, 99 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 100 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 101 | train=dict(type=dataset_type, 102 | ann_file=ann_file_train, 103 | data_prefix='', 104 | pipeline=train_pipeline), 105 | val=dict(type=dataset_type, 106 | ann_file=ann_file_val, 107 | data_prefix='', 108 | pipeline=val_pipeline), 109 | test=dict(type=dataset_type, 110 | ann_file=ann_file_test, 111 | data_prefix='', 112 | pipeline=test_pipeline)) 113 | 114 | # set the top-k accuracy during validation 115 | evaluation = dict( 116 | interval=5, # Interval to perform evaluation 117 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 118 | ) 119 | # set the top-k accuracy during testing 120 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 121 | 5))), ) 122 | # * optimizer 123 | optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001) 124 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 125 | # learning policy 126 | lr_config = dict(policy='CosineAnnealing', 127 | min_lr=0, 128 | warmup='linear', 129 | warmup_by_epoch=True, 130 | warmup_iters=34) 131 | total_epochs = 256 132 | 133 | # * runtime settings 134 | checkpoint_config = dict(interval=1) 135 | log_config = dict( 136 | interval=200, 137 | hooks=[ 138 | dict(type='TextLoggerHook'), 139 | # dict(type='TensorboardLoggerHook'), 140 | ]) 141 | dist_params = dict(backend='nccl') 142 | log_level = 'INFO' 143 | find_unused_parameters = False 144 | load_from = ( 145 | 'https://download.openmmlab.com/mmaction/recognition/' 146 | 'slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/' 147 | 'slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth') 148 | resume_from = None 149 | workflow = [('train', 1)] 150 | # disable opencv multithreading to avoid system being overloaded 151 | opencv_num_threads = 0 152 | # set multi-process start method as `fork` to speed up the training 153 | mp_start_method = 'fork' 154 | -------------------------------------------------------------------------------- /configs/slowonly/slowonly_nl_embedded_gaussian_r50_8x8x1_150e.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 
'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | 11 | # * model settings 12 | model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3dSlowOnly', 15 | depth=50, 16 | pretrained='torchvision://resnet50', 17 | lateral=False, 18 | conv1_kernel=(1, 7, 7), 19 | conv1_stride_t=1, 20 | pool1_stride_t=1, 21 | inflate=(0, 0, 1, 1), 22 | norm_eval=False, 23 | non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), 24 | (0, 0, 0)), 25 | non_local_cfg=dict(sub_sample=True, 26 | use_scale=True, 27 | norm_cfg=dict(type='BN3d', 28 | requires_grad=True), 29 | mode='embedded_gaussian')), 30 | cls_head=dict(type='I3DHead', 31 | in_channels=2048, 32 | num_classes=num_classes, 33 | spatial_type='avg', 34 | dropout_ratio=0.7, 35 | topk=(1, 2, 3, 4, 5)), 36 | # model training and testing settings 37 | train_cfg=None, 38 | test_cfg=dict(average_clips='prob')) 39 | 40 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 41 | std=[58.395, 57.12, 57.375], 42 | to_bgr=False) 43 | train_pipeline = [ 44 | dict(type='DecordInit'), 45 | dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), 46 | dict(type='DecordDecode'), 47 | dict(type='Resize', scale=(-1, 256)), 48 | dict(type='RandomResizedCrop'), 49 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 50 | dict(type='Flip', flip_ratio=0.5), 51 | dict(type='Normalize', **img_norm_cfg), 52 | dict(type='FormatShape', input_format='NCTHW'), 53 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 54 | dict(type='ToTensor', keys=['imgs', 'label']) 55 | ] 56 | val_pipeline = [ 57 | dict(type='DecordInit'), 58 | dict(type='SampleFrames', 59 | clip_len=8, 60 | frame_interval=8, 61 | num_clips=1, 62 | test_mode=True), 63 | dict(type='DecordDecode'), 64 | dict(type='Resize', scale=(-1, 256)), 65 | dict(type='CenterCrop', crop_size=224), 66 | dict(type='Normalize', **img_norm_cfg), 67 | dict(type='FormatShape', input_format='NCTHW'), 68 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 69 | dict(type='ToTensor', keys=['imgs']) 70 | ] 71 | test_pipeline = [ 72 | dict(type='DecordInit'), 73 | dict(type='SampleFrames', 74 | clip_len=8, 75 | frame_interval=8, 76 | num_clips=10, 77 | test_mode=True), 78 | dict(type='DecordDecode'), 79 | dict(type='Resize', scale=(-1, 256)), 80 | dict(type='ThreeCrop', crop_size=256), 81 | dict(type='Normalize', **img_norm_cfg), 82 | dict(type='FormatShape', input_format='NCTHW'), 83 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 84 | dict(type='ToTensor', keys=['imgs']) 85 | ] 86 | data = dict(videos_per_gpu=4, 87 | workers_per_gpu=1, 88 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 89 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 90 | train=dict(type=dataset_type, 91 | ann_file=ann_file_train, 92 | data_prefix='', 93 | pipeline=train_pipeline), 94 | val=dict(type=dataset_type, 95 | ann_file=ann_file_val, 96 | data_prefix='', 97 | pipeline=val_pipeline), 98 | test=dict(type=dataset_type, 99 | ann_file=ann_file_test, 100 | data_prefix='', 101 | pipeline=test_pipeline)) 102 | # set the top-k accuracy during validation 103 | evaluation = dict( 104 | interval=5, # Interval to perform evaluation 105 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 106 | ) 107 | # 
set the top-k accuracy during testing 108 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 109 | 5))), ) 110 | 111 | # * optimizer 112 | optimizer = dict(type='SGD', lr=0.00625, momentum=0.9, weight_decay=0.0001) 113 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 114 | # learning policy 115 | lr_config = dict(policy='step', 116 | step=[90, 130], 117 | warmup='linear', 118 | warmup_by_epoch=True, 119 | warmup_iters=10) 120 | total_epochs = 150 121 | 122 | # * runtime settings 123 | checkpoint_config = dict(interval=1) 124 | log_config = dict( 125 | interval=200, 126 | hooks=[ 127 | dict(type='TextLoggerHook'), 128 | # dict(type='TensorboardLoggerHook'), 129 | ]) 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' 133 | 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' 134 | 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' 135 | '20210308-e8dd9e82.pth') 136 | resume_from = None 137 | workflow = [('train', 1)] 138 | 139 | # disable opencv multithreading to avoid system being overloaded 140 | opencv_num_threads = 0 141 | # set multi-process start method as `fork` to speed up the training 142 | mp_start_method = 'fork' 143 | -------------------------------------------------------------------------------- /configs/timesformer/timesformer_divST_16x12x1_15e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[127.5, 127.5, 127.5], 11 | std=[127.5, 127.5, 127.5], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='TimeSformer', 19 | pretrained= # noqa: E251 20 | 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 21 | num_frames=16, 22 | img_size=224, 23 | patch_size=16, 24 | embed_dims=768, 25 | in_channels=3, 26 | dropout_ratio=0.2, 27 | transformer_layers=None, 28 | # divided attention is the best strategy 29 | attention_type='divided_space_time', 30 | norm_cfg=dict(type='LN', eps=1e-6)), 31 | cls_head=dict(type='TimeSformerHead', 32 | num_classes=num_classes, 33 | in_channels=768, 34 | topk=(1, 2, 3, 4, 5)), 35 | # model training and testing settings 36 | train_cfg=None, 37 | test_cfg=dict(average_clips='prob')) 38 | 39 | train_pipeline = [ 40 | dict(type='DecordInit'), 41 | # * frame_interval has been selected for 7s clips 42 | dict(type='SampleFrames', clip_len=16, frame_interval=12, num_clips=1), 43 | dict(type='DecordDecode'), 44 | dict(type='RandomRescale', scale_range=(256, 320)), 45 | dict(type='RandomCrop', size=224), 46 | dict(type='Flip', flip_ratio=0.5), 47 | dict(type='Normalize', **img_norm_cfg), 48 | dict(type='FormatShape', input_format='NCTHW'), 49 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 50 | dict(type='ToTensor', keys=['imgs', 'label']) 51 | ] 52 | val_pipeline = [ 53 | dict(type='DecordInit'), 54 | dict(type='SampleFrames', 55 | clip_len=16, 56 | frame_interval=12, 57 | num_clips=1, 58 | test_mode=True), 59 | 
dict(type='DecordDecode'), 60 | dict(type='Resize', scale=(-1, 256)), 61 | dict(type='CenterCrop', crop_size=224), 62 | dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=16, 71 | frame_interval=12, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 224)), 76 | dict(type='ThreeCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs', 'label']) 81 | ] 82 | data = dict(videos_per_gpu=1, 83 | workers_per_gpu=1, 84 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 85 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | train=dict(type=dataset_type, 87 | ann_file=ann_file_train, 88 | data_prefix='', 89 | pipeline=train_pipeline), 90 | val=dict(type=dataset_type, 91 | ann_file=ann_file_val, 92 | data_prefix='', 93 | pipeline=val_pipeline), 94 | test=dict(type=dataset_type, 95 | ann_file=ann_file_test, 96 | data_prefix='', 97 | pipeline=test_pipeline)) 98 | 99 | # set the top-k accuracy during validation 100 | evaluation = dict( 101 | interval=1, # Interval to perform evaluation 102 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 103 | ) 104 | # set the top-k accuracy during testing 105 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 106 | 5))), ) 107 | 108 | # optimizer 109 | optimizer = dict(type='SGD', 110 | lr=0.0015625, 111 | momentum=0.9, 112 | paramwise_cfg=dict( 113 | custom_keys={ 114 | '.backbone.cls_token': dict(decay_mult=0.0), 115 | '.backbone.pos_embed': dict(decay_mult=0.0), 116 | '.backbone.time_embed': dict(decay_mult=0.0) 117 | }), 118 | weight_decay=1e-4, 119 | nesterov=True) # this lr is used for 8 gpus 120 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 121 | 122 | # learning policy 123 | lr_config = dict(policy='step', step=[5, 10]) 124 | total_epochs = 25 125 | 126 | # * runtime settings 127 | checkpoint_config = dict(interval=1) 128 | log_config = dict( 129 | interval=1000, 130 | hooks=[ 131 | dict(type='TextLoggerHook'), 132 | # dict(type='TensorboardLoggerHook'), 133 | ]) 134 | # runtime settings 135 | dist_params = dict(backend='nccl') 136 | log_level = 'INFO' 137 | load_from = ('https://download.openmmlab.com/mmaction/recognition/timesformer/' 138 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb/' 139 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb-3f8e5d03.pth') 140 | resume_from = None 141 | workflow = [('train', 1)] 142 | 143 | # disable opencv multithreading to avoid system being overloaded 144 | opencv_num_threads = 0 145 | # set multi-process start method as `fork` to speed up the training 146 | mp_start_method = 'fork' 147 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH="1.8.0" 2 | ARG CUDA="11.1" 3 | ARG CUDNN="8" 4 | FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel 5 | 6 | ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0+PTX" 7 | ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" 8 | ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" 9 
| 10 | # ! https://github.com/NVIDIA/nvidia-docker/issues/1632 11 | # currently image not working properly 12 | RUN apt-key del 7fa2af80 13 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub 14 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub 15 | 16 | RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 \ 17 | libxrender-dev libxext6 ffmpeg nano p7zip-full imagemagick wget unzip \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | RUN pip install mmcv-full==1.3.18 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html 22 | RUN git clone --recurse-submodules https://github.com/rlleshi/phar.git phar 23 | 24 | # install mmaction, mmpose, mmdet 25 | WORKDIR /workspace/phar/mmaction2 26 | ENV FORCE_CUDA="1" 27 | RUN pip install cython --no-cache-dir 28 | RUN pip install --no-cache-dir -e . 29 | WORKDIR /workspace/phar/mmpose 30 | RUN pip install -r requirements.txt 31 | RUN pip install -v -e . 32 | RUN pip install mmdet==2.12.0 33 | 34 | # install extra dependencies 35 | WORKDIR /workspace/phar 36 | RUN pip install -r requirements/extra.txt 37 | 38 | # download models 39 | RUN wget https://github.com/rlleshi/phar/releases/download/v1.0.0/audio.pth -O checkpoints/har/audio.pth \ 40 | && wget https://github.com/rlleshi/phar/releases/download/v1.0.0/posec3d.pth -O checkpoints/har/posec3d.pth \ 41 | && wget https://github.com/rlleshi/phar/releases/download/v1.0.0/timeSformer.pth -O checkpoints/har/timeSformer.pth \ 42 | && wget https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth -O checkpoints/pose/hrnet_w32_coco_256x192.pth \ 43 | && wget http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ 44 | -O checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth 45 | -------------------------------------------------------------------------------- /requirements/extra.txt: -------------------------------------------------------------------------------- 1 | librosa==0.8.1 2 | lws==1.2.7 3 | mlflow 4 | moviepy==1.0.3 5 | numpy==1.22.4 6 | pyloudnorm==0.1.0 7 | rich 8 | schedule 9 | SoundFile==0.10.3.post1 10 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | addict==2.4.0 2 | albumentations==1.1.0 3 | alembic==1.4.1 4 | appdirs==1.4.4 5 | attrs==21.4.0 6 | audioread==2.1.9 7 | autopep8==1.6.0 8 | beautifulsoup4==4.9.3 9 | certifi==2020.12.5 10 | cffi==1.15.0 11 | cfgv==3.3.1 12 | chardet==4.0.0 13 | chumpy==0.70 14 | click==7.1.2 15 | cloudpickle==1.6.0 16 | colorama==0.4.4 17 | commonmark==0.9.1 18 | coverage==6.3.1 19 | cycler==0.10.0 20 | Cython==0.29.23 21 | databricks-cli==0.14.3 22 | decorator==4.4.2 23 | decord==0.4.1 24 | defusedxml==0.7.1 25 | distlib==0.3.4 26 | docker==5.0.0 27 | einops==0.3.0 28 | entrypoints==0.3 29 | ez-setup==0.9 30 | ffmpeg-python==0.2.0 31 | filelock==3.4.2 32 | flake8==4.0.1 33 | flake8-import-order==0.18.1 34 | Flask==2.0.0 35 | flatbuffers==2.0 36 | freetype-py==2.2.0 37 | future==0.18.2 38 | gitdb==4.0.7 39 | GitPython==3.1.17 40 | greenlet==1.1.0 41 | gunicorn==20.1.0 42 | identify==2.4.4 43 | idna==2.10 44 | imageio==2.9.0 45 | imageio-ffmpeg==0.4.3 46 | 
importlib-metadata==4.8.2 47 | iniconfig==1.1.1 48 | interrogate==1.5.0 49 | isort==4.3.21 50 | itsdangerous==2.0.0 51 | Jinja2==3.0.0 52 | joblib==1.1.0 53 | json-tricks==3.15.5 54 | kiwisolver==1.3.1 55 | librosa==0.8.1 56 | llvmlite==0.38.0 57 | lws==1.2.7 58 | Mako==1.1.4 59 | Markdown==3.3.6 60 | MarkupSafe==2.0.0 61 | matplotlib==3.4.2 62 | mccabe==0.6.1 63 | mlflow==1.17.0 64 | -e git+ssh://git@github.com/open-mmlab/mmaction2.git@255bbc08634c21e6400af7b9d1a470b52285ebcd#egg=mmaction2 65 | mmcv-full==1.3.18 66 | mmdet==2.12.0 67 | -e git+https://github.com/open-mmlab/mmpose.git@5c8ba2657b26ee9487451c45ba794823fa607cfd#egg=mmpose 68 | model-index==0.1.11 69 | motmetrics==1.2.0 70 | moviepy==1.0.3 71 | munkres==1.1.4 72 | networkx==2.6.3 73 | nodeenv==1.6.0 74 | numba==0.55.0 75 | numpy==1.20.2 76 | odfpy==1.4.1 77 | onnx==1.10.2 78 | onnxruntime==1.10.0 79 | opencv-contrib-python==4.5.4.60 80 | opencv-python==4.5.4.60 81 | opencv-python-headless==4.5.5.62 82 | openmim==0.1.5 83 | ordered-set==4.0.2 84 | packaging==21.3 85 | pandas==1.1.5 86 | Pillow==8.2.0 87 | platformdirs==2.4.1 88 | pluggy==1.0.0 89 | pooch==1.6.0 90 | poseval==0.1.0 91 | pre-commit==2.17.0 92 | proglog==0.1.9 93 | prometheus-client==0.10.1 94 | prometheus-flask-exporter==0.18.2 95 | protobuf==3.17.0 96 | py==1.11.0 97 | py-cpuinfo==8.0.0 98 | pycocotools==2.0.2 99 | pycodestyle==2.8.0 100 | pycparser==2.21 101 | pyflakes==2.4.0 102 | pyglet==1.5.21 103 | Pygments==2.10.0 104 | pyloudnorm==0.1.0 105 | PyOpenGL==3.1.0 106 | pyparsing==2.4.7 107 | pyrender==0.1.45 108 | pytest==7.0.0 109 | pytest-benchmark==3.4.1 110 | pytest-runner==5.3.1 111 | python-dateutil==2.8.1 112 | python-editor==1.0.4 113 | pytz==2021.1 114 | PyWavelets==1.2.0 115 | PyYAML==5.4.1 116 | qudida==0.0.4 117 | querystring-parser==1.2.4 118 | requests==2.25.1 119 | resampy==0.2.2 120 | rich==10.9.0 121 | scenedetect==0.5.6.1 122 | schedule==1.1.0 123 | scikit-image==0.19.1 124 | scikit-learn==1.0.2 125 | scipy==1.6.3 126 | seaborn==0.11.1 127 | Shapely==1.8.0 128 | six==1.16.0 129 | smmap==4.0.0 130 | smplx==0.1.28 131 | SoundFile==0.10.3.post1 132 | soupsieve==2.2.1 133 | SQLAlchemy==1.4.15 134 | sqlparse==0.4.1 135 | tabulate==0.8.9 136 | terminaltables==3.1.0 137 | threadpoolctl==3.0.0 138 | tifffile==2022.2.2 139 | toml==0.10.2 140 | tomli==2.0.1 141 | torch==1.10.0+cu113 142 | torchaudio==0.10.0+cu113 143 | torchvision==0.11.1+cu113 144 | tqdm==4.60.0 145 | trimesh==3.10.0 146 | typing-extensions==3.10.0.0 147 | urllib3==1.26.4 148 | vidaug==0.1 149 | virtualenv==20.13.0 150 | webcolors==1.11.1 151 | websocket-client==0.59.0 152 | Werkzeug==2.0.0 153 | xdoctest==0.15.10 154 | xmltodict==0.12.0 155 | xtcocotools==1.10 156 | yapf==0.31.0 157 | zipp==3.4.1 158 | -------------------------------------------------------------------------------- /resources/ann_dist_clip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/ann_dist_clip.jpg -------------------------------------------------------------------------------- /resources/ann_dist_clips.json: -------------------------------------------------------------------------------- 1 | { 2 | "kissing": 598, 3 | "fondling": 409, 4 | "handjob": 1403, 5 | "fingering": 952, 6 | "titjob": 1174, 7 | "blowjob": 1883, 8 | "cunnilingus": 1733, 9 | "deepthroat": 2057, 10 | "doggy": 1883, 11 | "the_snake": 1595, 12 | "anal": 1560, 13 | "missionary": 1882, 14 | "cowgirl": 1663, 15 | 
"scoop_up": 1336, 16 | "cumshot": 570, 17 | "facial_cumshot": 781, 18 | "69": 1132, 19 | "total": 22611, 20 | "average": 1330 21 | } 22 | -------------------------------------------------------------------------------- /resources/annotation_distribution(min).json: -------------------------------------------------------------------------------- 1 | { 2 | "kissing": 72.7, 3 | "fondling": 48.1, 4 | "handjob": 164.9, 5 | "fingering": 112.7, 6 | "titjob": 137.2, 7 | "blowjob": 222.6, 8 | "cunnilingus": 203.5, 9 | "deepthroat": 250.2, 10 | "doggy": 221.2, 11 | "the_snake": 186.4, 12 | "anal": 184.4, 13 | "missionary": 226.9, 14 | "cowgirl": 193.7, 15 | "scoop_up": 155.3, 16 | "cumshot": 67.7, 17 | "facial_cumshot": 94.4, 18 | "69": 132.7, 19 | "total": 2674.6000000000004 20 | } 21 | -------------------------------------------------------------------------------- /resources/annotation_distribution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/annotation_distribution.jpg -------------------------------------------------------------------------------- /resources/annotations/annotations.txt: -------------------------------------------------------------------------------- 1 | kissing 2 | fondling 3 | handjob 4 | fingering 5 | titjob 6 | blowjob 7 | cunnilingus 8 | deepthroat 9 | doggy 10 | the-snake 11 | anal 12 | missionary 13 | cowgirl 14 | scoop-up 15 | cumshot 16 | facial-cumshot 17 | 69 18 | -------------------------------------------------------------------------------- /resources/annotations/annotations_audio.txt: -------------------------------------------------------------------------------- 1 | anal 2 | deepthroat 3 | doggy 4 | blowjob 5 | -------------------------------------------------------------------------------- /resources/annotations/annotations_pose.txt: -------------------------------------------------------------------------------- 1 | blowjob 2 | doggy 3 | missionary 4 | cowgirl 5 | 69 6 | kissing 7 | -------------------------------------------------------------------------------- /resources/annotations/current_annotations.txt: -------------------------------------------------------------------------------- 1 | blowjob 2 | ass-eating 3 | deep-throat 4 | cunnilingus 5 | tit-fuck 6 | handjob 7 | cumshot 8 | anal 9 | fingering 10 | kissing 11 | tit-sucking 12 | squirting 13 | gaping 14 | other 15 | doggystyle 16 | cowgirl 17 | 69 18 | missionary 19 | reverse-cowgirl 20 | spooning 21 | -------------------------------------------------------------------------------- /resources/annotations/temp.txt: -------------------------------------------------------------------------------- 1 | kissing 2 | -------------------------------------------------------------------------------- /resources/audio/db_20_config.yml: -------------------------------------------------------------------------------- 1 | anal: -46.23 2 | deepthroat: -45.69 3 | doggy: -48.04 4 | blowjob: -53.72 5 | cumshot: -45.03 6 | cunnilingus: -48.73 7 | miscellaneous: -49.02 8 | kissing: -51.1 9 | -------------------------------------------------------------------------------- /resources/audio/db_30_config.yml: -------------------------------------------------------------------------------- 1 | anal: -40.81 2 | deepthroat: -41.32 3 | doggy: -42.63 4 | blowjob: -45.28 5 | -------------------------------------------------------------------------------- /resources/metrics/audio_cm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/audio_cm.png -------------------------------------------------------------------------------- /resources/metrics/audio_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/audio_loss.jpg -------------------------------------------------------------------------------- /resources/metrics/posec3d_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/posec3d_loss.jpg -------------------------------------------------------------------------------- /resources/metrics/skeleton_cm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/skeleton_cm.png -------------------------------------------------------------------------------- /resources/metrics/timesformer_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/timesformer_loss.jpg -------------------------------------------------------------------------------- /src/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/__int__.py -------------------------------------------------------------------------------- /src/analysis/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/analysis/__int__.py -------------------------------------------------------------------------------- /src/analysis/audio_filter.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import shutil 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pyloudnorm as pyln 8 | import soundfile as sf 9 | from rich.console import Console 10 | from tqdm import tqdm 11 | 12 | CONSOLE = Console() 13 | 14 | EXTS = ['.wav'] 15 | 16 | 17 | def parse_args(): 18 | parser = ArgumentParser(prog='filter audio based on loudness. 
' 19 | 'Removes a certain percentile') 20 | parser.add_argument('src_dir', help='src directory') 21 | parser.add_argument('out_dir', help='out directory') 22 | parser.add_argument('--percentile', 23 | type=int, 24 | default=20, 25 | help='thresholding percentile for loudness in db') 26 | parser.add_argument('--level', 27 | type=int, 28 | default=1, 29 | help='directory level of data') 30 | args = parser.parse_args() 31 | return args 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | Path(args.out_dir).mkdir(parents=True, exist_ok=True) 37 | CONSOLE.print( 38 | f'Thresholding all audios found in {args.src_dir} with the ' 39 | f'{args.percentile}-th percentile', 40 | style='green') 41 | 42 | audios = glob.glob(args.src_dir + '/*' * args.level) 43 | audios = [ 44 | audio for audio in audios if any(audio.endswith(ext) for ext in EXTS) 45 | ] 46 | 47 | # assuming that all audios have same rate 48 | _, rate = sf.read(audios[0]) 49 | meter = pyln.Meter(rate) # meter works with decibels 50 | loudness = [] 51 | 52 | for audio in tqdm(audios): 53 | data, _ = sf.read(audio) 54 | loudness.append((audio, meter.integrated_loudness(data))) 55 | 56 | min_db = np.percentile([loud[1] for loud in loudness], args.percentile) 57 | CONSOLE.print(f'{args.percentile}-th percentile is {min_db}', 58 | style='green') 59 | 60 | filtered_audios = list(filter(lambda x: x[1] > min_db, loudness)) 61 | for audio in filtered_audios: 62 | shutil.copy(audio[0], args.out_dir) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /src/analysis/class_distribution_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import os.path as osp 5 | import sys 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | from rich.console import Console 10 | 11 | sys.path.append('./tools') # noqa 12 | import utils as utils # noqa isort:skip 13 | 14 | CONSOLE = Console() 15 | PLOT_SPLIT_THR = 26 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description='calculates number of clips / annotation classes') 21 | parser.add_argument('--src-dir', 22 | default='mmaction2/data/phar', 23 | help='the dir that contains all the videos') 24 | parser.add_argument('--splits', 25 | nargs='+', 26 | default=['train', 'val', 'test'], 27 | choices=['train', 'val', 'test'], 28 | help='the splits where clips are found') 29 | parser.add_argument('--ann', 30 | type=str, 31 | default='resources/annotations/annotations.txt', 32 | help='annotation file') 33 | parser.add_argument('--out-dir', 34 | default='resources/', 35 | help='directory to store output files') 36 | args = parser.parse_args() 37 | return args 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | result = utils.annotations_dic(args.ann) 43 | result = {k: 0 for k, _ in result.items()} 44 | 45 | for split in args.splits: 46 | path_to_label = osp.join(args.src_dir, split) 47 | for label in os.listdir(path_to_label): 48 | result[label] += len(os.listdir(osp.join(path_to_label, label))) 49 | 50 | labels = list(result.keys()) 51 | values = list(result.values()) 52 | result['total'] = sum(values) 53 | result['average'] = round(result['total'] / len(values)) 54 | 55 | # save json 56 | result_json = json.dumps(result, indent=4) 57 | f = open(osp.join(args.out_dir, 'ann_dist_clips.json'), 'w') 58 | print(result_json, file=f) 59 | f.close() 60 | 61 | # save plot 62 | dfs = [] 63 | if len(labels) >= 
PLOT_SPLIT_THR: 64 | # have to split in at least 2 groups for readability 65 | dfs.append( 66 | pd.DataFrame({ 67 | 'Class': labels[:int(len(labels) / 2)], 68 | 'Value': values[:int(len(values) / 2)] 69 | })) 70 | dfs.append( 71 | pd.DataFrame({ 72 | 'Class': labels[int(len(labels) / 2):], 73 | 'Value': values[int(len(values) / 2):] 74 | })) 75 | else: 76 | dfs.append(pd.DataFrame({'Class': labels, 'Value': values})) 77 | 78 | for df in dfs: 79 | sns.set(rc={'figure.figsize': (15, 13)}) 80 | fig = sns.barplot(x='Class', y='Value', data=df) 81 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) 82 | fig.axes.set_title('Sample Distribution / Class ', fontsize=40) 83 | fig.set_xlabel('Class', fontsize=30) 84 | fig.set_ylabel('Value', fontsize=20) 85 | output = fig.get_figure() 86 | output.savefig( 87 | osp.join(args.out_dir, f'ann_dist_clips_{utils.gen_id(2)}.jpg')) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /src/analysis/class_distribution_time.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os.path as osp 4 | import sys 5 | from argparse import ArgumentParser 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | from rich.console import Console 11 | 12 | sys.path.append('./src') # noqa: E501 13 | import utils as utils # noqa isort:skip 14 | 15 | CONSOLE = Console() 16 | ANN_EXT = '.csv' 17 | 18 | 19 | def get_actions_with_timestamps(path): 20 | """Given the path to a csv file, get its timestamps. 21 | 22 | The function is specific to the temporal csv annotations produced by the 23 | VIA annotator. 24 | """ 25 | results = [] 26 | try: 27 | df = pd.read_csv(path) 28 | except Exception: 29 | CONSOLE.print(f'Error reading {path}', style='error') 30 | for i in range(1, len(df)): 31 | temp = str(df.iloc[i].value_counts()).split(' ') 32 | results.append({ 33 | 'action': 34 | temp[0].split(':"')[1].strip('}"'), 35 | 'video': 36 | ''.join(list(filter(lambda x: x not in '["],', temp[6]))), 37 | 'start': 38 | float(temp[7][:-1]), 39 | 'end': 40 | float(temp[8][:-2]) 41 | }) 42 | return results 43 | 44 | 45 | def parse_args(): 46 | parser = ArgumentParser(prog='time analysis of annotation distribution ' 47 | 'based on the CSV files annotations.') 48 | parser.add_argument('--csv-dir', 49 | default='dataset/', 50 | help='directory of csv annotations') 51 | parser.add_argument('--out_dir', default='resources/') 52 | parser.add_argument('--ann', 53 | type=str, 54 | default='resources/annotations/annotations.txt', 55 | help='annotation file') 56 | parser.add_argument('--level', 57 | type=int, 58 | default=1, 59 | choices=[1, 2], 60 | help='directory level of data') 61 | args = parser.parse_args() 62 | return args 63 | 64 | 65 | def save_results(out, result): 66 | cls = [k for k in result.keys()] 67 | val = [v for v in result.values()] 68 | tot = sum(val) 69 | val = list(map(lambda x: x / tot, val)) 70 | 71 | # save json 72 | result['total'] = tot 73 | result_json = json.dumps(result, indent=4) 74 | f = open(osp.join(out, 'annotation_distribution(min).json'), 'w') 75 | print(result_json, file=f) 76 | f.close() 77 | 78 | # save plot 79 | df = pd.DataFrame({'Class': cls, 'Value': val}) 80 | sns.set(rc={'figure.figsize': (15, 13)}) 81 | fig = sns.barplot(x='Class', y='Value', data=df) 82 | fig.set_xticklabels(fig.get_xticklabels(), rotation=15) 83 | fig.axes.set_title('Sample Distribution / Class ', 
fontsize=40) 84 | fig.set_xlabel('Class', fontsize=30) 85 | fig.set_ylabel('Total %', fontsize=20) 86 | output = fig.get_figure() 87 | output.savefig(osp.join(out, 'annotation_distribution.jpg')) 88 | 89 | 90 | def main(): 91 | args = parse_args() 92 | ann_count = dict.fromkeys(utils.annotations_dic(args.ann), 0) 93 | if args.level == 1: 94 | search = osp.join(args.csv_dir, '*') 95 | elif args.level == 2: 96 | search = osp.join(args.csv_dir, '*', '*') 97 | annotations = [ 98 | item for item in glob.glob(search) if item.endswith(ANN_EXT) 99 | ] 100 | 101 | for ann in annotations: 102 | for action in get_actions_with_timestamps(ann): 103 | label = action['action'].replace('-', '_') 104 | duration = action['end'] - action['start'] 105 | if np.isnan(duration): 106 | # faulty annotation 107 | continue 108 | try: 109 | ann_count[label] += duration 110 | except KeyError: 111 | CONSOLE.print(f'{ann} has misspelled label {label}', 112 | style='yellow') 113 | 114 | ann_count = {k: round(v / 60, 1) for k, v in ann_count.items()} 115 | save_results(args.out_dir, ann_count) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /src/analysis/evaluate_acc_per_cls.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pickle 4 | import sys 5 | from argparse import ArgumentParser 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import torch 10 | from mmaction.apis import inference_recognizer, init_recognizer 11 | from rich.console import Console 12 | from tqdm import tqdm 13 | 14 | # https://stackoverflow.com/questions/4383571/importing-files-from-different-folder 15 | sys.path.append('human-action-recognition/') # noqa 16 | import har.tools.helpers as helpers # noqa isort:skips 17 | 18 | # sys.path.append('/mmaction2/') 19 | CONSOLE = Console() 20 | 21 | 22 | def parse_args(): 23 | parser = ArgumentParser(prog='accuracy per class for a bunch of clips') 24 | parser.add_argument('checkpoint', help='model') 25 | parser.add_argument('split', type=str, help='train/validation/test') 26 | parser.add_argument('--src-dir', 27 | type=str, 28 | default='/mmaction2/data/tanz/videos_val/', 29 | help='source dir of videos to be evaluated as clips') 30 | parser.add_argument('--out', 31 | type=str, 32 | default='/mnt/data_transfer/write/', 33 | help='out dir') 34 | parser.add_argument('--config', type=str, help='model config file') 35 | parser.add_argument( 36 | '--ann', 37 | type=str, 38 | default=('human-action-recognition/har/annotations/BAST/base/' 39 | 'tanz_annotations.txt'), 40 | help='classes/labels') 41 | parser.add_argument('--device', type=str, default='cuda:0', help='cpu/gpu') 42 | parser.add_argument('--type', 43 | default='rgb', 44 | choices=['rgb', 'skeleton'], 45 | help='rgb or skeleton') 46 | parser.add_argument('--topk', 47 | type=int, 48 | nargs='+', 49 | default=[1, 2, 3], 50 | choices=[1, 2, 3, 4, 5], 51 | help='top-k accuracy to evaluate') 52 | args = parser.parse_args() 53 | return args 54 | 55 | 56 | def save(args, results): 57 | # if args.out.endswith('.json'): 58 | # import json 59 | # results_json = json.dumps(results, indent=4) 60 | # f = open(out, 'w') 61 | # print(results , file=f) 62 | # f.close() 63 | out = osp.join(args.out, args.split + '_acc_per_class.csv') 64 | df = pd.DataFrame(results) 65 | df.to_csv(out, index=False) 66 | print('Saved {} csv file'.format(out)) 67 | 68 | sns.set(rc={'figure.figsize': 
(11.7, 8.27)}) 69 | fig = sns.barplot(x='Class', y='Value', hue='Accuracy', data=df) # 70 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) # 71 | fig.axes.set_title('Top 3 Accuracy ' + args.split + '-set', fontsize=40) 72 | fig.set_xlabel('Class', fontsize=30) 73 | fig.set_ylabel('Value', fontsize=20) 74 | output = fig.get_figure() # 75 | out = osp.splitext(out)[0] + '.jpg' 76 | output.savefig(out) 77 | print('Saved {} plot'.format(out)) 78 | 79 | 80 | def skeleton( 81 | args, 82 | number_to_label, 83 | model, 84 | ): 85 | total_count = helpers.bast_annotations_to_dict(args.ann) 86 | dist = {k: helpers.bast_annotations_to_dict(args.ann) for k in args.topk} 87 | 88 | for sample in tqdm(os.listdir(args.src_dir)): 89 | with open(osp.join(args.src_dir, sample), 'rb') as f: 90 | ann = pickle.load(f) 91 | ann['start_index'] = 0 92 | ann['modality'] = 'Pose' 93 | label = number_to_label[ann['label']] 94 | total_count[label] += 1 95 | result = inference_recognizer(model, ann) 96 | 97 | previous_k = 0 98 | for k in args.topk: 99 | # if its in top 1 & 2 it will count for top 3 100 | for i in range(previous_k, k): 101 | if number_to_label[result[i][0]] == label: 102 | dist[k][label] += 1 103 | for j in args.topk: 104 | # if its in top 3 it will count for top 4 105 | if (j != k) & (j > k): 106 | dist[j][label] += 1 107 | previous_k = k 108 | 109 | results = [] 110 | for i in dist.keys(): 111 | for k, v in dist[i].items(): 112 | acc = (v / total_count[k]) if total_count[k] != 0 else 0 113 | results.append({'Class': k, 'Accuracy': f'acc_{i}', 'Value': acc}) 114 | save(args, results) 115 | 116 | no_labels = 0 117 | for k in total_count.keys(): 118 | if total_count[k] > 0: 119 | no_labels += 1 120 | for i in dist.keys(): 121 | macro_acc = 0 122 | for k, v in dist[i].items(): 123 | if total_count[k] == 0: 124 | macro_acc += 0 125 | else: 126 | macro_acc += v / total_count[k] 127 | 128 | CONSOLE.print( 129 | f'Macro top-{i} Acc: ' 130 | f'{round(100 * macro_acc / no_labels, 3)}', 131 | style='green') 132 | 133 | 134 | def rgb(args, number_to_label, model): 135 | total_count = helpers.bast_annotations_to_dict(args.ann) 136 | dist = {k: helpers.bast_annotations_to_dict(args.ann) for k in args.topk} 137 | 138 | for label in tqdm(os.listdir(args.src_dir)): 139 | class_dir = osp.join(args.src_dir, label) 140 | 141 | for clip in tqdm(os.listdir(class_dir)): 142 | previous_k = 0 143 | total_count[label] += 1 144 | result = inference_recognizer(model, osp.join(class_dir, clip)) 145 | 146 | for k in args.topk: 147 | # if its in top 1 & 2 it will count for top 3 148 | for i in range(previous_k, k): 149 | if number_to_label[result[i][0]] == label: 150 | dist[k][label] += 1 151 | for j in args.topk: 152 | # if its in top 3 it will count for top 4 153 | if (j != k) & (j > k): 154 | dist[j][label] += 1 155 | previous_k = k 156 | 157 | results = [] 158 | for i in dist.keys(): 159 | for k, v in dist[i].items(): 160 | acc = (v / total_count[k]) if total_count[k] != 0 else 0 161 | results.append({'Class': k, 'Accuracy': f'acc_{i}', 'Value': acc}) 162 | save(args, results) 163 | 164 | no_labels = 0 165 | for k in total_count.keys(): 166 | if total_count[k] > 0: 167 | no_labels += 1 168 | for i in dist.keys(): 169 | macro_acc = 0 170 | for k, v in dist[i].items(): 171 | if total_count[k] == 0: 172 | macro_acc += 0 173 | else: 174 | macro_acc += v / total_count[k] 175 | 176 | CONSOLE.print( 177 | f'Macro top-{i} Acc: ' 178 | f'{round(100 * macro_acc / no_labels, 3)}', 179 | style='green') 180 | 181 | 182 | def main(): 
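    # Builds the recognizer from `config` + `checkpoint` via `init_recognizer`,
    # maps class indices to label names, and dispatches to `rgb()` or
    # `skeleton()` depending on `--type`. Both paths accumulate per-class
    # top-k hits, write a CSV and a bar plot through `save()`, and print the
    # macro top-k accuracy over the classes that actually contain samples.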
183 | args = parse_args() 184 | model = init_recognizer(args.config, args.checkpoint, 185 | torch.device(args.device)) 186 | CONSOLE.print( 187 | f'# Evaluating accuracy per class for the {args.split}-set ' 188 | f'of config file {args.config.split("/")[-1]}', 189 | style='green') 190 | number_to_label = helpers.bast_number_to_label(args.ann) 191 | callback = rgb if args.type == 'rgb' else skeleton 192 | callback(args, number_to_label, model) 193 | 194 | 195 | if __name__ == '__main__': 196 | main() 197 | -------------------------------------------------------------------------------- /src/analysis/pose_feasibility.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import os 3 | import os.path as osp 4 | import sys 5 | import warnings 6 | from argparse import ArgumentParser 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | import seaborn as sns 11 | from rich.console import Console 12 | from tqdm import tqdm 13 | 14 | sys.path.append('./src') # noqa: E501 15 | import data.pose_extraction as pose_extraction # noqa isort:skip 16 | 17 | try: 18 | from mmdet.apis import init_detector 19 | from mmpose.apis import init_pose_model 20 | except ImportError: 21 | warnings.warn( 22 | 'Please install MMDet and MMPose for pose extraction.') # noqa: E501 23 | 24 | CONSOLE = Console() 25 | POSE_EXTR_PATH = 'src/data/skeleton/pose_extraction.py' 26 | PROGRESS_FILE = 'pose_feasibility_progress.txt' 27 | 28 | 29 | def parse_args(): 30 | parser = ArgumentParser(prog='check the pose feasibility for a class' 31 | 'Also generates the .pkl pose dicts.') 32 | parser.add_argument('label', help='class/label to examine') 33 | parser.add_argument('--src-dir', 34 | default='mmaction2/data/phar', 35 | help='directory of dataset') 36 | parser.add_argument('--out-dir', default='mmaction2/data/phar/pose') 37 | parser.add_argument('--ann', 38 | type=str, 39 | default='resources/annotations/pose.txt', 40 | help='annotation file') 41 | parser.add_argument('--splits', 42 | nargs='+', 43 | default=['train', 'val', 'test'], 44 | choices=['train', 'val', 'test'], 45 | help='the splits where clips are found') 46 | parser.add_argument('--pose-score-thr', 47 | type=float, 48 | default=0.2, 49 | help='pose estimation score threshold') 50 | parser.add_argument('--resume', 51 | action='store_true', 52 | help='ggf. resume analysis from previous run') 53 | parser.add_argument('--device', default='cuda:0', help='device') 54 | parser.add_argument( 55 | '--det-config', 56 | default=('mmdetection/configs/faster_rcnn/' 57 | 'faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py'), 58 | help='human detector config') 59 | parser.add_argument( 60 | '--det-checkpoint', 61 | default='checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth', 62 | help='human detector checkpoint') 63 | parser.add_argument( 64 | '--pose-config', 65 | default=('mmpose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/' 66 | 'coco/hrnet_w32_coco_256x192.py'), 67 | help='pose estimation config') 68 | parser.add_argument('--pose-checkpoint', 69 | default='checkpoints/pose/hrnet_w32_coco_256x192.pth', 70 | help='pose estimation checkpoint') 71 | args = parser.parse_args() 72 | return args 73 | 74 | 75 | def get_pose(args, d_model, p_model): 76 | """Perform pose estimation given a video. 
77 | 78 | Args: 79 | args (dict): parsed args 80 | d_model: detection model 81 | p_model: pose model 82 | 83 | Returns: 84 | int: correct poses rate 85 | """ 86 | return pose_extraction.main(args, d_model, p_model) 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | # percentiles of clips having correct poses: 92 | # for a certain percentile it means: {n_videos_in_percentile / total_vids} 93 | # have {percentile %} of their poses with a confidence higher than 94 | # {args.pose_score_thr} 95 | results = {k: 0 for k in range(0, 101, 10)} 96 | det_model = init_detector(args.det_config, args.det_checkpoint, 97 | args.device) 98 | pose_model = init_pose_model(args.pose_config, args.pose_checkpoint, 99 | args.device) 100 | 101 | sub_args = abc.ABC() 102 | sub_args = abc.abstractproperty() 103 | sub_args.device = args.device 104 | sub_args.det_score_thr = 0.5 105 | sub_args.pose_score_thr = args.pose_score_thr 106 | sub_args.ann = args.ann 107 | sub_args.correct_rate = 0.2 108 | sub_args.filter_pose = False 109 | 110 | resume_list = [] 111 | if args.resume: 112 | if osp.exists(PROGRESS_FILE): 113 | with open(PROGRESS_FILE, 'r') as resume_from: 114 | resume_list = resume_from.readlines()[0].split(',') 115 | else: 116 | CONSOLE.print( 117 | f'Resume option selected but {PROGRESS_FILE} not' 118 | ' found.', 119 | style='yellow') 120 | 121 | for split in args.splits: 122 | out_dir = osp.join(args.out_dir, split, args.label) 123 | in_dir = osp.join(args.src_dir, split, args.label) 124 | Path(out_dir).mkdir(parents=True, exist_ok=True) 125 | sub_args.out_dir = out_dir 126 | 127 | for clip in tqdm(os.listdir(in_dir)): 128 | if clip in resume_list: 129 | CONSOLE.print(f'Already processed. Skipping {clip}.', 130 | style='green') 131 | continue 132 | 133 | sub_args.video = osp.join(in_dir, clip) 134 | result = get_pose(sub_args, det_model, pose_model) 135 | if result is None: 136 | CONSOLE.print(f'{clip} already exists. 
Skipping.', 137 | style='green') 138 | continue 139 | 140 | result *= 100 141 | for k in results.keys(): 142 | if result > k: 143 | results[k] += 1 144 | with open(PROGRESS_FILE, 'a+') as out: 145 | out.write(f'{clip},') 146 | 147 | # plot 148 | df = pd.DataFrame({ 149 | '%': list(results.keys()), 150 | 'Value': list(results.values()) 151 | }) 152 | sns.set(rc={'figure.figsize': (15, 13)}) 153 | fig = sns.barplot(x='%', y='Value', data=df) 154 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) 155 | fig.axes.set_title(f'Correct Poses ({args.pose_score_thr}-conf-thr)', 156 | fontsize=40) 157 | fig.set_xlabel('%', fontsize=30) 158 | fig.set_ylabel('Values', fontsize=20) 159 | output = fig.get_figure() 160 | 161 | out = osp.join(args.out_dir, f'correct_poses_rate_{args.label}.jpg') 162 | output.savefig(out) 163 | CONSOLE.print(f'Saved @{out}') 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /src/analysis/print_layers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from mmaction.apis import init_recognizer 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser(prog='model layer printer') 8 | parser.add_argument('config', help='config file path') 9 | parser.add_argument('checkpoint', help='checkpoint file') 10 | parser.add_argument('--device', 11 | type=str, 12 | default='cuda:0', 13 | help='CPU/CUDA device option') 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def print_layers(model, layer_name): 19 | if len(model._modules) == 0: 20 | print(layer_name) 21 | else: 22 | for key in model._modules: 23 | name = key if len(layer_name) == 0 else layer_name + '/' + key 24 | print_layers(model._modules[key], name) 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | model = init_recognizer(args.config, args.checkpoint, device=args.device) 30 | print_layers(model, '') 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /src/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | # phar 3 | 4 | Make Porn Great Again 5 | 6 | ## Building datasets 7 | 8 | ### Build Video Dataset 9 | 10 | 1. Define the annotations @resources/annotations/annotations.txt 11 | 2. Create the clip dataset using `src/data/generate_dataset.py` 12 | 3. Downgrade the quality of the videos using `src/data/resize_videos.py`. Training will be much faster as resize overhead is removed. 13 | 4. Optionally, use `RepeatDataset` to further speed up training. 14 | 5. Use `mmaction2/src/analysis/check_videos.py` to check if the dataset is valid. 15 | 16 | ### Build Pose Dataset 17 | 18 | 1. Define the annotations @resources/annotations/annotations_pose.txt 19 | 2. Extract the pose information from the videos with `src/analysis/pose_feasibility.py` or `src/data/skeleton/generate_dataset_pose.py` 20 | - Prefer `pose_feasibility`: it discards poses with low confidence and also reports how hard it is to extract reliable poses from the dataset. 21 | 3. Merge the poses into lists with `merge_pose` @`src/misc.py` 22 | 23 | ### Build Audio Dataset 24 | 25 | 1. Define the annotations @resources/annotations/annotations_audio.txt 26 | 2. Extract the audio from the videos with `mmaction2/src/data/extract_audio.py` 27 | - `Stream map '0:a' matches no streams` means that the videos have no audio! 28 | 3.
Optionally, filter the extracted audio files based on their loudness with `src/analysis/audio_filter.py` 29 | 4. Extract spectrogram features with `mmaction2/src/data/build_audio_features.py` 30 | 5. Generate annotation list with `src/data/audio/build_file_list.py` 31 | -------------------------------------------------------------------------------- /src/data/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/data/__int__.py -------------------------------------------------------------------------------- /src/data/augment_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os.path as osp 4 | import random 5 | import shutil 6 | import sys 7 | from itertools import repeat 8 | from multiprocessing import cpu_count 9 | from pathlib import Path 10 | 11 | import cv2 12 | import numpy as np 13 | from rich.console import Console 14 | from tqdm import tqdm 15 | from tqdm.contrib.concurrent import process_map 16 | from vidaug import augmentors as va 17 | 18 | sys.path.append('./src') # noqa 19 | import utils as utils # noqa isort:skip 20 | 21 | CONSOLE = Console() 22 | AUGS = [ 23 | va.InvertColor(), 24 | va.InvertColor(), 25 | va.Add(value=100), 26 | va.Add(value=-100), 27 | va.Pepper(ratio=45), 28 | va.Pepper(ratio=15), 29 | va.Salt(ratio=100), 30 | va.Salt(ratio=25), 31 | va.GaussianBlur(sigma=1.2), 32 | va.GaussianBlur(sigma=2), 33 | va.GaussianBlur(sigma=3.5), 34 | va.ElasticTransformation(alpha=1.5, sigma=0.5), 35 | va.ElasticTransformation(alpha=3.5, sigma=0.5), 36 | va.PiecewiseAffineTransform(displacement=4, 37 | displacement_kernel=2, 38 | displacement_magnification=3), 39 | va.PiecewiseAffineTransform(displacement=2, 40 | displacement_kernel=1, 41 | displacement_magnification=2) 42 | ] 43 | 44 | 45 | def parse_args(): 46 | parser = argparse.ArgumentParser(description='Augmenting train set script') 47 | parser.add_argument('--src-dir', 48 | default='mmaction2/data/phar/train', 49 | help='source video directory') 50 | parser.add_argument('--out-dir', 51 | default='mmaction2/data/phar/train_aug/', 52 | help='augmented video directory') 53 | parser.add_argument('--rate', 54 | type=float, 55 | default=0.3, 56 | help='replacement rate for videos') 57 | parser.add_argument('--ann', 58 | type=str, 59 | default='resources/annotations/annotations.txt', 60 | help='annotation file') 61 | parser.add_argument('--num-processes', 62 | type=int, 63 | default=(cpu_count() - 2 or 1), 64 | help='number of processes used') 65 | args = parser.parse_args() 66 | return args 67 | 68 | 69 | def augment_video(items): 70 | """Augments a video.
71 | 72 | Args: 73 | clip (str): path to video 74 | out_dir (str): path to out dir 75 | """ 76 | clip, out_dir, random_clips = items 77 | if clip not in random_clips: 78 | # no augmentation, just copy it 79 | shutil.copy(clip, out_dir) 80 | return 81 | 82 | video = cv2.VideoCapture(clip) 83 | out = osp.join(out_dir, osp.basename(clip)) 84 | video_writer = cv2.VideoWriter( 85 | out, cv2.VideoWriter_fourcc(*'mp4v'), video.get(cv2.CAP_PROP_FPS), 86 | (round(video.get(cv2.CAP_PROP_FRAME_WIDTH)), 87 | round(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) 88 | 89 | frames = [] 90 | while cv2.waitKey(1) < 0: 91 | success, frame = video.read() 92 | if not success: 93 | video.release() 94 | break 95 | frames.append(frame) 96 | 97 | aug = random.choice(AUGS) 98 | frames = aug(np.array(frames)) 99 | for frame in frames: 100 | video_writer.write(frame) 101 | 102 | 103 | def main(): 104 | args = parse_args() 105 | Path(args.out_dir).mkdir(parents=True, exist_ok=True) 106 | assert 0 < args.rate < 1.0 107 | 108 | for label in tqdm(utils.annotations_list(args.ann)): 109 | out_dir_label = osp.join(args.out_dir, label) 110 | Path(out_dir_label).mkdir(parents=True, exist_ok=True) 111 | clips = glob.glob(osp.join(args.src_dir, label, '*')) 112 | random_clips = random.sample(clips, round(len(clips) * args.rate)) 113 | CONSOLE.print(f'Augmenting {len(random_clips)} clips for {label}...', 114 | style='bold green') 115 | 116 | process_map(augment_video, 117 | zip(clips, repeat(out_dir_label), repeat(random_clips)), 118 | max_workers=args.num_processes, 119 | total=len(clips)) 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /src/data/build_file_list.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import shutil 3 | import sys 4 | from argparse import ArgumentParser 5 | 6 | import numpy as np 7 | from rich.console import Console 8 | 9 | sys.path.append('./tools') # noqa 10 | import utils as utils # noqa isort:skip 11 | 12 | CONSOLE = Console() 13 | SPEC_EXT = '.npy' 14 | 15 | 16 | def parse_args(): 17 | parser = ArgumentParser(prog='generate file list for audio dataset based ' 18 | 'on video list') 19 | parser.add_argument( 20 | 'src_dir', 21 | type=str, 22 | help='root dir for video dataset where the ann files are generated') 23 | parser.add_argument( 24 | '--audio-dir', 25 | type=str, 26 | default='audio_feature', 27 | help='audio subdir inside the src_dir that contains spectograms') 28 | parser.add_argument('--split', 29 | type=str, 30 | nargs='+', 31 | default=['train', 'val', 'test'], 32 | help='splits where the spectograms are located') 33 | parser.add_argument('--ann', 34 | type=str, 35 | default='resources/annotations/annotations_audio.txt', 36 | help='audio annotations') 37 | args = parser.parse_args() 38 | return args 39 | 40 | 41 | def main(): 42 | args = parse_args() 43 | if not osp.exists(args.src_dir): 44 | CONSOLE.print(f'{args.src_dir} not found', style='red') 45 | return 46 | 47 | ann_to_list = utils.annotations_dic(args.ann) 48 | for split in args.split: 49 | split = split + '.txt' 50 | out_dir = osp.join(args.src_dir, args.audio_dir, split) 51 | shutil.copyfile(osp.join(args.src_dir, split), out_dir) 52 | 53 | with open(out_dir, 'r') as out: 54 | content = out.read() 55 | 56 | path = osp.splitext(out_dir)[0] 57 | with open(out_dir, 'w') as out: 58 | 59 | for line in content.split('\n'): 60 | if len(line) == 0: 61 | continue 62 | 63 | _, 
category, clip = line.rsplit(osp.sep, 2) 64 | new_path = osp.join(path, category, clip).split(' ')[0] 65 | new_class_id = ann_to_list.get(category, None) 66 | 67 | if new_class_id is not None: 68 | new_path = new_path.split('.')[0] + SPEC_EXT 69 | if not osp.exists(new_path): 70 | # corresponding .npy file doesn't exist (e.g. filtered) 71 | continue 72 | 73 | count = len(np.load(new_path)) 74 | out.write(f'{new_path} {count} {new_class_id}\n') 75 | 76 | CONSOLE.print(f'Created list file @{out_dir}', style='green') 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /src/data/generate_dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path as osp 3 | import random 4 | import string 5 | import sys 6 | from argparse import ArgumentParser 7 | from itertools import repeat 8 | from multiprocessing import cpu_count 9 | from pathlib import Path 10 | 11 | import moviepy.editor as mpy 12 | import numpy as np 13 | import pandas as pd 14 | from rich.console import Console 15 | from tqdm.contrib.concurrent import process_map 16 | 17 | sys.path.append('./src') # noqa: E501 18 | import utils as utils # noqa isort:skip 19 | 20 | CONSOLE = Console() 21 | 22 | VIDEO_EXTS = ['mp4'] 23 | ANN_EXT = '.csv' 24 | ANN_TO_INDEX = dict() 25 | 26 | 27 | def gen_id(size=8): 28 | """Generate a random id.""" 29 | chars = string.ascii_uppercase + string.digits 30 | return ''.join(random.choice(chars) for _ in range(size)) 31 | 32 | 33 | def generate_structure(out_dir: str, annotations: str): 34 | """Generate the videos dataset structure. 35 | 36 | Args: 37 | out_dir (str): directory to generate structure for 38 | annotations (str): path to annotation file that has classes 39 | """ 40 | classes = utils.annotations_list(annotations) 41 | Path(out_dir).mkdir(parents=True, exist_ok=True) 42 | 43 | for split in ['train', 'val', 'test']: 44 | for c in classes: 45 | Path(osp.join(out_dir, split, c)).mkdir(parents=True, 46 | exist_ok=True) 47 | open(osp.join(out_dir, f'{split}.txt'), 'w').close() 48 | 49 | 50 | def get_video_annotation(id: int, anns: list) -> str: 51 | """Gets the annotation for a video based on its id. The assumption here is 52 | that both the video and its corresponding annotation have been named with a 53 | number. 54 | 55 | Args: 56 | id (int): video id 57 | anns (list): list of annotation paths 58 | 59 | Returns: 60 | ann (str): path to annotations 61 | """ 62 | return (ann for ann in anns if ann.split(osp.sep)[-1][:-4] == id) 63 | 64 | 65 | def parse_args(): 66 | parser = ArgumentParser(prog='generate video dataset.' 
67 | 'Videos have the same name as annotations') 68 | parser.add_argument('--src_dir', 69 | default='dataset/', 70 | help='source video directory') 71 | parser.add_argument('--ann', 72 | type=str, 73 | default='resources/annotations/all.txt', 74 | help='annotation file') 75 | parser.add_argument('--out-dir', 76 | default='mmaction2/data/phar', 77 | help='out video directory') 78 | parser.add_argument('--split', 79 | type=float, 80 | nargs='+', 81 | default=[0.8, 0.2, 0], 82 | help='train/val/test split') 83 | parser.add_argument('--clip-len', 84 | type=int, 85 | default=10, 86 | help='length of each clip') 87 | parser.add_argument('--num-processes', 88 | type=int, 89 | default=(cpu_count() - 2 or 1), 90 | help='number of processes used') 91 | parser.add_argument('--level', 92 | type=int, 93 | default=1, 94 | choices=[1, 2], 95 | help='directory level to find videos') 96 | args = parser.parse_args() 97 | return args 98 | 99 | 100 | def write_annotation(path: str): 101 | """Write the corresponding annotation to the annotation file. The 102 | annotation consists of the path to the video + label number. 103 | 104 | `mmaction2/data/temp/train/the_snake/DOZ9WC51.mp4 9` 105 | 106 | Args: 107 | path (str): path to the clip 108 | """ 109 | path_to_ann_f, label = osp.split(osp.dirname(path)) 110 | with open(f'{path_to_ann_f}.txt', 'a') as out: 111 | out.write(f'{path} {ANN_TO_INDEX[label]}') 112 | out.write('\n') 113 | 114 | 115 | def get_actions_with_timestamps(path: str) -> list: 116 | """Given the path to a csv file, get its timestamps. 117 | 118 | This function is specific to the temporal csv annotations 119 | produced by the VIA annotator: 120 | 121 | `Show/Hide attribute editor` -> Add `Activity` 122 | Name: "Activity"; 123 | Anchor: "Temporal Segment in Video or Audio"; 124 | Description: "Activity" 125 | 126 | Args: 127 | path (str): path to annotation 128 | 129 | Returns: 130 | list: list of timestamps 131 | """ 132 | results = [] 133 | df = pd.read_csv(path) 134 | for i in range(1, len(df)): 135 | temp = str(df.iloc[i].value_counts()).split(' ') 136 | results.append({ 137 | 'action': 138 | temp[0].split(':"')[1].strip('}"'), 139 | 'video': 140 | ''.join(list(filter(lambda x: x not in '["],', temp[6]))), 141 | 'start': 142 | float(temp[7][:-1]), 143 | 'end': 144 | float(temp[8][:-2]) 145 | }) 146 | return results 147 | 148 | 149 | def extract_clips(items): 150 | """Extract clips of length `args.clip_len` given a video and its 151 | annotations.""" 152 | video_f, anns, args = items 153 | ann = get_video_annotation(id=video_f.split(osp.sep)[-1][:-4], anns=anns) 154 | ann = next(ann, None) 155 | if ann is None: 156 | CONSOLE.print(f'Video {video_f} has no annotations.', style='yellow') 157 | return 158 | 159 | clip_len = args.clip_len 160 | min_remainder = clip_len / 2 # ggf. 
overlap 161 | np.random.seed() 162 | split = np.random.choice(['train', 'val', 'test'], p=args.split) 163 | video = mpy.VideoFileClip(video_f) 164 | 165 | for action in get_actions_with_timestamps(ann): 166 | duration = action['end'] - action['start'] 167 | if np.isnan(duration): 168 | # faulty annotation 169 | continue 170 | if duration < clip_len: 171 | continue 172 | 173 | label = action['action'].replace('-', '_') 174 | 175 | if ANN_TO_INDEX.get(label, None) is None: 176 | # skip if label not found 177 | continue 178 | 179 | n_clips = int(duration / clip_len) 180 | remainder = duration % clip_len 181 | 182 | for i in range(n_clips): 183 | start = action['start'] + i * clip_len 184 | end = start + clip_len 185 | subclip = video.subclip(start, end) 186 | out_f = f'{osp.join(args.out_dir, split, label, gen_id())}.mp4' 187 | 188 | try: 189 | subclip.write_videofile(out_f, logger=None) 190 | write_annotation(out_f) 191 | except OSError: 192 | CONSOLE.print(f'{video_f} has bad annotations', style='red') 193 | continue 194 | 195 | if remainder >= min_remainder: 196 | # small overlap will exist, but we savor`min_remainder` footage 197 | out_f = f'{osp.join(args.out_dir, split, label, gen_id())}.mp4' 198 | subclip = video.subclip(action['end'] - clip_len, action['end']) 199 | try: 200 | subclip.write_videofile(out_f, logger=None) 201 | write_annotation(out_f) 202 | except OSError: 203 | CONSOLE.print(f'{video_f} has bad annotations', style='red') 204 | pass 205 | 206 | 207 | def main(): 208 | args = parse_args() 209 | assert sum(args.split) == 1, 'train/val/test split must equal to 1' 210 | assert osp.exists(args.ann), 'provide label map file' 211 | generate_structure(args.out_dir, args.ann) 212 | global ANN_TO_INDEX 213 | ANN_TO_INDEX = utils.annotations_dic(args.ann) 214 | 215 | if args.level == 1: 216 | items = glob.glob(osp.join(args.src_dir, '*')) 217 | elif args.level == 2: 218 | items = glob.glob(osp.join(args.src_dir, '*', '*')) 219 | 220 | videos = [ 221 | item for item in items if any( 222 | item.endswith(ext) for ext in VIDEO_EXTS) 223 | ] 224 | annotations = [item for item in items if item.endswith(ANN_EXT)] 225 | np.random.shuffle(videos) 226 | 227 | process_map(extract_clips, 228 | zip(videos, repeat(annotations), repeat(args)), 229 | max_workers=args.num_processes, 230 | total=len(videos)) 231 | 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /src/data/generate_dataset_pose.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import os.path as osp 4 | import pickle 5 | import random 6 | import subprocess 7 | import sys 8 | from argparse import ArgumentParser 9 | from collections import defaultdict 10 | from itertools import repeat 11 | from multiprocessing import Pool 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | sys.path.append('human-action-recognition/') # noqa 18 | 19 | CLIPS_PATH = 'clips' 20 | RESULT_PATH = 'results' 21 | 22 | 23 | def merge_pose_data(in_dir, out_dir, split): 24 | """Given the pose estimation of single videos stored as dictionaries in. 25 | 26 | .pkl format, merge them together and form a list of dictionaries. 
27 | 28 | Args: 29 | in_dir ([string]): path to the .pkl files for individual clips 30 | out_dir ([string]): path to the out dir 31 | split ([string]): train, val, test 32 | """ 33 | result = [] 34 | for ann in os.listdir(in_dir): 35 | if ann.endswith('.pkl'): 36 | with open(osp.join(in_dir, ann), 'rb') as f: 37 | annotations = pickle.load(f) 38 | result.append(annotations) 39 | 40 | out_file = osp.join(out_dir, f'bast_{split}.pkl') 41 | with open(out_file, 'wb') as out: 42 | pickle.dump(result, out, protocol=pickle.HIGHEST_PROTOCOL) 43 | 44 | 45 | def generate_structure(path): 46 | Path(path).mkdir(parents=True, exist_ok=True) 47 | for split in ['train', 'val', 'test']: 48 | Path(osp.join(path, CLIPS_PATH, split)).mkdir(parents=True, 49 | exist_ok=True) 50 | Path(osp.join(path, RESULT_PATH)).mkdir(parents=True, exist_ok=True) 51 | 52 | 53 | def parse_args(): 54 | parser = ArgumentParser(prog='generate pose data for skeleton-based-har ' 55 | 'based on a VideoDataset directory.') 56 | parser.add_argument('src_dir', type=str, help='VideoDataset directory') 57 | parser.add_argument('split_set', 58 | nargs='+', 59 | choices=['train', 'val', 'test'], 60 | help='type of sets to generate the pose dataset for') 61 | parser.add_argument('--out-dir', 62 | type=str, 63 | default='data/skeleton/bast_base/', 64 | help='resulting dataset dir') 65 | parser.add_argument( 66 | '--ann', 67 | type=str, 68 | default=('human-action-recognition/har/annotations/BAST/base/' 69 | 'tanz_annotations.txt'), 70 | help='annotations') 71 | parser.add_argument('--devices', 72 | nargs='+', 73 | choices=['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3'], 74 | help='gpu to use; can parallelize for each split-set') 75 | args = parser.parse_args() 76 | return args 77 | 78 | 79 | def merge_results(args): 80 | for split in args.split_set: 81 | in_dir = osp.join(args.out_dir, CLIPS_PATH, split) 82 | out_dir = osp.join(args.out_dir, RESULT_PATH) 83 | merge_pose_data(in_dir, out_dir, split) 84 | 85 | 86 | def get_pose(video, args, split, gpu): 87 | script_path = ('human-action-recognition/har/tools/data/skeleton/' 88 | 'pose_extraction.py') 89 | if split == '': 90 | split = 'test' 91 | else: 92 | split = split.split('_')[1] 93 | 94 | out_dir = osp.join(args.out_dir, CLIPS_PATH, split) 95 | subargs = [ 96 | 'python', script_path, video, args.ann, '--out-dir', out_dir, 97 | '--device', gpu 98 | ] 99 | try: 100 | logging.info(subprocess.run(subargs)) 101 | except subprocess.CalledProcessError as e: 102 | logging.exception(f'Error while generating pose data for {video}: {e}') 103 | 104 | 105 | def extract_pose(pose_items): 106 | split_label, gpu, args = pose_items 107 | split, labels = split_label 108 | label_path = osp.join(args.src_dir, split) 109 | 110 | for label in labels: 111 | print(f'Extracting pose for {split} - {label}') 112 | clip_path = osp.join(label_path, label) 113 | 114 | for clip in tqdm(os.listdir(clip_path)): 115 | get_pose(osp.join(clip_path, clip), args, split, gpu) 116 | 117 | 118 | def main(): 119 | logging.basicConfig(filename='skeleton_dataset.log', level=logging.DEBUG) 120 | args = parse_args() 121 | generate_structure(args.out_dir) 122 | n_gpus = len(args.devices) 123 | pool = Pool(n_gpus) 124 | 125 | split_labels = [] 126 | for split in args.split_set: 127 | # based on the current structure of the `data-transfer` volume 128 | if split == 'test': 129 | split = '' 130 | else: 131 | split = 'videos_' + split 132 | 133 | labels = os.listdir(osp.join(args.src_dir, split)) 134 | random.shuffle(labels) 135 | # split_labels 
= [(train, [walk, ..., stamp]), ... 136 | # (val, [contract_expand, ..., fall])] 137 | n_splits = int(n_gpus / len(args.split_set)) 138 | split_labels += [(split, label_split) 139 | for label_split in np.array_split(labels, n_splits)] 140 | 141 | if len(args.devices) > 1: 142 | pool.map(extract_pose, zip(split_labels, args.devices, repeat(args))) 143 | else: 144 | print('Running on a single GPU') 145 | dd = defaultdict(list) 146 | # merge the splits 147 | for key, value in split_labels: 148 | if len(dd[key]) == 0: 149 | dd[key] = value 150 | else: 151 | for v in value: 152 | dd[key].append(v) 153 | split_labels = list(dd.items()) 154 | for split_label in split_labels: 155 | extract_pose((split_label, args.devices[0], args)) 156 | 157 | merge_results(args) 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /src/data/pose_extraction.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import argparse 3 | import logging 4 | import os 5 | import os.path as osp 6 | import random as rd 7 | import shutil 8 | import string 9 | import sys 10 | import warnings 11 | from collections import defaultdict 12 | 13 | import cv2 14 | import mmcv 15 | import numpy as np 16 | from rich.console import Console 17 | 18 | try: 19 | from mmdet.apis import inference_detector, init_detector 20 | from mmpose.apis import inference_top_down_pose_model, init_pose_model 21 | except ImportError: 22 | warnings.warn( 23 | 'Please install MMDet and MMPose for pose extraction.') # noqa: E501 24 | 25 | sys.path.append('src/') # noqa 26 | import utils as utils # noqa isort:skip 27 | 28 | MMDET_ROOT = 'mmdetection' 29 | MMPOSE_ROOT = 'mmpose' 30 | args = abc.ABC() 31 | args = abc.abstractproperty() 32 | args.det_config = f'{MMDET_ROOT}/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py' # noqa: E501 33 | args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 34 | args.pose_config = f'{MMPOSE_ROOT}/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py' # noqa: E501 35 | args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 36 | 37 | N_PERSON = 2 # * for bboxes 38 | ANN_TO_INDEX = dict() 39 | CONSOLE = Console() 40 | 41 | 42 | def gen_id(size=8): 43 | chars = string.ascii_uppercase + string.digits 44 | return ''.join(rd.choice(chars) for _ in range(size)) 45 | 46 | 47 | def extract_frame(video_path): 48 | dname = gen_id() 49 | os.makedirs(dname, exist_ok=True) 50 | frame_tmpl = osp.join(dname, 'img_{:05d}.jpg') 51 | vid = cv2.VideoCapture(video_path) 52 | frame_paths = [] 53 | flag, frame = vid.read() 54 | first_frame = frame 55 | cnt = 0 56 | while flag: 57 | frame_path = frame_tmpl.format(cnt + 1) 58 | frame_paths.append(frame_path) 59 | 60 | cv2.imwrite(frame_path, frame) 61 | cnt += 1 62 | flag, frame = vid.read() 63 | 64 | # corrupted video, no frame 65 | if first_frame is None: 66 | return None, None 67 | 68 | return frame_paths, first_frame.shape[:2] 69 | 70 | 71 | def detection_inference(args, frame_paths, det_model=None): 72 | if det_model is None: 73 | model = init_detector(args.det_config, args.det_checkpoint, 74 | args.device) 75 | else: 76 | model = det_model 77 | assert model.CLASSES[0] == 'person', ('We 
require you to use a detector ' 78 | 'trained on COCO') 79 | results = [] 80 | CONSOLE.print('Performing Human Detection for each frame...', 81 | style='green') 82 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 83 | for frame_path in frame_paths: 84 | result = inference_detector(model, frame_path) 85 | # We only keep human detections with score larger than det_score_thr 86 | result = result[0][result[0][:, 4] >= args.det_score_thr] 87 | results.append(result) 88 | prog_bar.update() 89 | return results 90 | 91 | 92 | def intersection(b0, b1): 93 | l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) 94 | u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) 95 | return max(0, r - l) * max(0, d - u) 96 | 97 | 98 | def iou(b0, b1): 99 | i = intersection(b0, b1) 100 | u = area(b0) + area(b1) - i 101 | return i / u 102 | 103 | 104 | def area(b): 105 | return (b[2] - b[0]) * (b[3] - b[1]) 106 | 107 | 108 | def removedup(bbox): 109 | def inside(box0, box1, thre=0.8): 110 | return intersection(box0, box1) / area(box0) > thre 111 | 112 | num_bboxes = bbox.shape[0] 113 | if num_bboxes == 1 or num_bboxes == 0: 114 | return bbox 115 | valid = [] 116 | for i in range(num_bboxes): 117 | flag = True 118 | for j in range(num_bboxes): 119 | if i != j and inside(bbox[i], 120 | bbox[j]) and bbox[i][4] <= bbox[j][4]: 121 | flag = False 122 | break 123 | if flag: 124 | valid.append(i) 125 | return bbox[valid] 126 | 127 | 128 | def is_easy_example(det_results, num_person): 129 | threshold = 0.95 130 | 131 | def thre_bbox(bboxes, thre=threshold): 132 | shape = [sum(bbox[:, -1] > thre) for bbox in bboxes] 133 | ret = np.all(np.array(shape) == shape[0]) 134 | return shape[0] if ret else -1 135 | 136 | if thre_bbox(det_results) == num_person: 137 | det_results = [x[x[..., -1] > 0.95] for x in det_results] 138 | return True, np.stack(det_results) 139 | return False, thre_bbox(det_results) 140 | 141 | 142 | def bbox2tracklet(bbox): 143 | iou_thre = 0.6 144 | tracklet_id = -1 145 | tracklet_st_frame = {} 146 | tracklets = defaultdict(list) 147 | for t, box in enumerate(bbox): 148 | for idx in range(box.shape[0]): 149 | matched = False 150 | for tlet_id in range(tracklet_id, -1, -1): 151 | cond1 = iou(tracklets[tlet_id][-1][-1], box[idx]) >= iou_thre 152 | cond2 = (t - tracklet_st_frame[tlet_id] - 153 | len(tracklets[tlet_id]) < 10) 154 | cond3 = tracklets[tlet_id][-1][0] != t 155 | if cond1 and cond2 and cond3: 156 | matched = True 157 | tracklets[tlet_id].append((t, box[idx])) 158 | break 159 | if not matched: 160 | tracklet_id += 1 161 | tracklet_st_frame[tracklet_id] = t 162 | tracklets[tracklet_id].append((t, box[idx])) 163 | return tracklets 164 | 165 | 166 | def drop_tracklet(tracklet): 167 | tracklet = {k: v for k, v in tracklet.items() if len(v) > 5} 168 | 169 | def meanarea(track): 170 | boxes = np.stack([x[1] for x in track]).astype(np.float32) 171 | areas = (boxes[..., 2] - boxes[..., 0]) * (boxes[..., 3] - 172 | boxes[..., 1]) 173 | return np.mean(areas) 174 | 175 | tracklet = {k: v for k, v in tracklet.items() if meanarea(v) > 5000} 176 | return tracklet 177 | 178 | 179 | def distance_tracklet(tracklet): 180 | dists = {} 181 | for k, v in tracklet.items(): 182 | bboxes = np.stack([x[1] for x in v]) 183 | c_x = (bboxes[..., 2] + bboxes[..., 0]) / 2. 184 | c_y = (bboxes[..., 3] + bboxes[..., 1]) / 2. 
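# NOTE: the two offsets below measure each tracklet's mean centre distance from the fixed point (480, 270), i.e. the centre of a 960x540 frame; this appears to assume that resolution, so other frame sizes may need a different reference point.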
185 | c_x -= 480 186 | c_y -= 270 187 | c = np.concatenate([c_x[..., None], c_y[..., None]], axis=1) 188 | dist = np.linalg.norm(c, axis=1) 189 | dists[k] = np.mean(dist) 190 | return dists 191 | 192 | 193 | def tracklet2bbox(track, num_frame): 194 | # assign_prev 195 | bbox = np.zeros((num_frame, 5)) 196 | trackd = {} 197 | for k, v in track: 198 | bbox[k] = v 199 | trackd[k] = v 200 | for i in range(num_frame): 201 | if bbox[i][-1] <= 0.5: 202 | mind = np.Inf 203 | for k in trackd: 204 | if np.abs(k - i) < mind: 205 | mind = np.abs(k - i) 206 | bbox[i] = bbox[k] 207 | return bbox 208 | 209 | 210 | def tracklets2bbox(tracklet, num_frame): 211 | dists = distance_tracklet(tracklet) 212 | sorted_inds = sorted(dists, key=lambda x: dists[x]) 213 | dist_thre = np.Inf 214 | for i in sorted_inds: 215 | if len(tracklet[i]) >= num_frame / 2: 216 | dist_thre = 2 * dists[i] 217 | break 218 | 219 | dist_thre = max(50, dist_thre) 220 | 221 | bbox = np.zeros((num_frame, 5)) 222 | bboxd = {} 223 | for idx in sorted_inds: 224 | if dists[idx] < dist_thre: 225 | for k, v in tracklet[idx]: 226 | if bbox[k][-1] < 0.01: 227 | bbox[k] = v 228 | bboxd[k] = v 229 | bad = 0 230 | for idx in range(num_frame): 231 | if bbox[idx][-1] < 0.01: 232 | bad += 1 233 | mind = np.Inf 234 | mink = None 235 | for k in bboxd: 236 | if np.abs(k - idx) < mind: 237 | mind = np.abs(k - idx) 238 | mink = k 239 | bbox[idx] = bboxd[mink] 240 | return bad, bbox 241 | 242 | 243 | def bboxes2bbox(bbox, num_frame): 244 | ret = np.zeros((num_frame, 2, 5)) 245 | for t, item in enumerate(bbox): 246 | if item.shape[0] <= 2: 247 | ret[t, :item.shape[0]] = item 248 | else: 249 | inds = sorted(list(range(item.shape[0])), 250 | key=lambda x: -item[x, -1]) 251 | ret[t] = item[inds[:2]] 252 | for t in range(num_frame): 253 | if ret[t, 0, -1] <= 0.01: 254 | ret[t] = ret[t - 1] 255 | elif ret[t, 1, -1] <= 0.01: 256 | if t: 257 | if ret[t - 1, 0, -1] > 0.01 and ret[t - 1, 1, -1] > 0.01: 258 | if iou(ret[t, 0], ret[t - 1, 0]) > iou( 259 | ret[t, 0], ret[t - 1, 1]): 260 | ret[t, 1] = ret[t - 1, 1] 261 | else: 262 | ret[t, 1] = ret[t - 1, 0] 263 | return ret 264 | 265 | 266 | def det_postproc(det_results, vid): 267 | det_results = [removedup(x) for x in det_results] 268 | CONSOLE.print(f'\nn_person={N_PERSON}', style='green') 269 | 270 | is_easy, bboxes = is_easy_example(det_results, N_PERSON) 271 | if is_easy: 272 | msg = f'\n{vid} Easy Example' 273 | logging.info(msg) 274 | CONSOLE.print(msg, style='green') 275 | return bboxes 276 | 277 | tracklets = bbox2tracklet(det_results) 278 | tracklets = drop_tracklet(tracklets) 279 | 280 | msg = (f'\n{vid } Hard {N_PERSON}-person Example, ' 281 | f'found {len(tracklets)} tracklet') 282 | logging.info(msg) 283 | CONSOLE.print(msg, style='green') 284 | 285 | if N_PERSON == 1: 286 | if len(tracklets) == 1: 287 | tracklet = list(tracklets.values())[0] 288 | det_results = tracklet2bbox(tracklet, len(det_results)) 289 | # * return np.stack(det_results) - specific to the NTU dataset 290 | return np.stack( 291 | np.array([np.array([det_res]) for det_res in det_results])) 292 | else: 293 | _, det_results = tracklets2bbox(tracklets, len(det_results)) 294 | return np.array([np.array([det_res]) for det_res in det_results]) 295 | # * return det_results - specific to the NTU dataset 296 | 297 | # * n_person = 2 298 | 299 | if len(tracklets) == 0: 300 | # no bboxes found at all 301 | return [] 302 | 303 | if len(tracklets) <= 2: 304 | tracklets = list(tracklets.values()) 305 | bboxes = [] 306 | for tracklet in tracklets: 307 
| bboxes.append(tracklet2bbox(tracklet, len(det_results))[:, None]) 308 | bbox = np.concatenate(bboxes, axis=1) 309 | return bbox 310 | else: 311 | return bboxes2bbox(det_results, len(det_results)) 312 | 313 | 314 | def pose_inference(args, frame_paths, det_results, pose_model=None): 315 | if pose_model is None: 316 | model = init_pose_model(args.pose_config, args.pose_checkpoint, 317 | args.device) 318 | else: 319 | model = pose_model 320 | CONSOLE.print('Performing Human Pose Estimation for each frame...', 321 | style='green') 322 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 323 | 324 | num_frame = len(det_results) 325 | num_person = max([len(x) for x in det_results]) 326 | kp = np.zeros((num_person, num_frame, 17, 3), dtype=np.float32) 327 | 328 | for i, (f, d) in enumerate(zip(frame_paths, det_results)): 329 | # Align input format 330 | d = [dict(bbox=x) for x in list(d) if x[-1] > 0.5] 331 | pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] 332 | for j, item in enumerate(pose): 333 | kp[j, i] = item['keypoints'] 334 | prog_bar.update() 335 | return kp 336 | 337 | 338 | def pose_extraction(vid, 339 | filter_pose, 340 | thr=None, 341 | det_model=None, 342 | pose_model=None): 343 | frame_paths, img_shape = extract_frame(vid) 344 | if frame_paths is None and img_shape is None: 345 | CONSOLE.print(f'{vid} is corrupted', style='red') 346 | return -1, -1 347 | 348 | det_results = detection_inference(args, frame_paths, det_model) 349 | det_results = det_postproc(det_results, vid) 350 | if 0 == len(det_results): 351 | CONSOLE.print(f'No bounding boxes found for {vid}.', style='yellow') 352 | return None, None 353 | 354 | pose_results = pose_inference(args, frame_paths, det_results, pose_model) 355 | anno = dict() 356 | anno['keypoint'] = pose_results[..., :2] 357 | anno['keypoint_score'] = pose_results[..., 2] 358 | anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] 359 | anno['img_shape'] = img_shape 360 | anno['original_shape'] = img_shape 361 | anno['total_frames'] = pose_results.shape[1] 362 | anno['label'] = ANN_TO_INDEX[vid.split('/')[-2]] 363 | 364 | # filter pose estimation based on threshold 365 | n_person = anno['keypoint_score'].shape[0] 366 | n_frames = len(anno['keypoint_score'][0]) 367 | count_0 = 0 368 | for k in range(0, n_person): 369 | for i in range(0, n_frames): 370 | for j in range(0, 17): # 17 defined keypoints 371 | if anno['keypoint_score'][k][i][j] < thr: 372 | if filter_pose: 373 | anno['keypoint'][k][i][j] = 0 374 | count_0 += 1 375 | 376 | correct_rate = 1 - round(count_0 / (n_person * n_frames * 17), 3) 377 | CONSOLE.print( 378 | f'\n{100*correct_rate}% of poses have a threshold higher ' 379 | f'than {thr}', 380 | style='yellow') 381 | shutil.rmtree(osp.dirname(frame_paths[0])) 382 | 383 | return anno, correct_rate 384 | 385 | 386 | def parse_args(): 387 | parser = argparse.ArgumentParser( 388 | description='Generate Pose Annotation for a single video') 389 | parser.add_argument('video', type=str, help='source video') 390 | parser.add_argument('ann', type=str, help='dataset annotations') 391 | parser.add_argument('--out-dir', 392 | type=str, 393 | default='mmaction2/data/phar/pose', 394 | help='output dir') 395 | parser.add_argument('--det-score-thr', 396 | type=float, 397 | default=0.5, 398 | help='detection score threshold') 399 | parser.add_argument('--pose-score-thr', 400 | type=float, 401 | default=0.5, 402 | help='pose estimation score threshold') 403 | parser.add_argument('--correct-rate', 404 | type=float, 405 | default=0.5, 406 | 
help=('if less than this rate of frame poses have a ' 407 | 'lower confidence than `poses-score-thr`, do not' 408 | 'save the pkl result')) 409 | parser.add_argument( 410 | '--filter-pose', 411 | action='store_true', 412 | help='whether to set the pose estimation of frames ' 413 | 'with score confidence less than the threshold to zero') 414 | parser.add_argument('--device', type=str, default='cuda:0') 415 | args = parser.parse_args() 416 | return args 417 | 418 | 419 | def main(sub_args, det_model=None, pose_model=None): 420 | out = osp.join(sub_args.out_dir, 421 | osp.splitext(sub_args.video.split('/')[-1])[0]) + '.pkl' 422 | if osp.exists(out): 423 | CONSOLE.print(f'{out} exists. Skipping...', style='yellow') 424 | return 425 | 426 | global ANN_TO_INDEX, args 427 | args = sub_args 428 | ANN_TO_INDEX = utils.annotations_dic(args.ann) 429 | anno, correct_rate = pose_extraction(args.video, args.filter_pose, 430 | args.pose_score_thr, det_model, 431 | pose_model) 432 | if anno is None and correct_rate is None: 433 | return 0 434 | elif anno == -1 and correct_rate == -1: 435 | return 436 | 437 | # save poses if they don't have more than `args.incorrect_thr %` of poses 438 | # with a lower confidence than `args.poses_score_thr` 439 | if correct_rate > args.correct_rate: 440 | mmcv.dump(anno, out) 441 | 442 | return correct_rate 443 | 444 | 445 | if __name__ == '__main__': 446 | logging.basicConfig(filename='pose_extraction.log', level=logging.DEBUG) 447 | global_args = parse_args() 448 | args.device = global_args.device 449 | args.video = global_args.video 450 | args.out_dir = global_args.out_dir 451 | args.det_score_thr = global_args.det_score_thr 452 | args.pose_score_thr = global_args.pose_score_thr 453 | args.ann = global_args.ann 454 | args.correct_rate = global_args.correct_rate 455 | main(args) 456 | -------------------------------------------------------------------------------- /src/demo/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/demo/__int__.py -------------------------------------------------------------------------------- /src/demo/demo_audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | 4 | import torch 5 | from mmaction.apis import inference_recognizer, init_recognizer 6 | from mmcv import Config, DictAction 7 | from rich.console import Console 8 | 9 | CONSOLE = Console() 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='MMAction2 demo') 14 | parser.add_argument('config', help='test config file path') 15 | parser.add_argument('checkpoint', help='checkpoint file/url') 16 | parser.add_argument('audio', help='audio file') 17 | parser.add_argument('--label', 18 | default='resources/annotations/annotations_audio.txt', 19 | help='label file') 20 | parser.add_argument( 21 | '--cfg-options', 22 | nargs='+', 23 | action=DictAction, 24 | default={}, 25 | help='override some settings in the used config, the key-value pair ' 26 | 'in xxx=yyy format will be merged into config file. 
For example, ' 27 | "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") 28 | parser.add_argument('--device', 29 | type=str, 30 | default='cuda:0', 31 | help='CPU/CUDA device option') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | device = torch.device(args.device) 39 | cfg = Config.fromfile(args.config) 40 | cfg.merge_from_dict(args.cfg_options) 41 | 42 | # build the recognizer from a config file and checkpoint file/url 43 | model = init_recognizer(cfg, args.checkpoint, device=device) 44 | if not args.audio.endswith('.npy'): 45 | raise NotImplementedError('Demo works on extracted audio features') 46 | 47 | results = inference_recognizer(model, args.audio) 48 | 49 | labels = open(args.label).readlines() 50 | labels = [x.strip() for x in labels] 51 | results = [(labels[k[0]], k[1]) for k in results] 52 | 53 | CONSOLE.print('Scores:', style='green') 54 | for result in results: 55 | CONSOLE.print(f'{result[0]}: ', result[1]) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /src/demo/demo_skeleton.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import shutil 5 | 6 | import cv2 7 | import mmcv 8 | import numpy as np 9 | import torch 10 | from mmaction.apis import inference_recognizer, init_recognizer 11 | from mmcv import DictAction 12 | from rich.console import Console 13 | 14 | try: 15 | from mmdet.apis import inference_detector, init_detector 16 | except (ImportError, ModuleNotFoundError): 17 | raise ImportError('Failed to import `inference_detector` and ' 18 | '`init_detector` form `mmdet.apis`. These apis are ' 19 | 'required in this demo! ') 20 | 21 | try: 22 | from mmpose.apis import (inference_top_down_pose_model, init_pose_model, 23 | vis_pose_result) 24 | except (ImportError, ModuleNotFoundError): 25 | raise ImportError('Failed to import `inference_top_down_pose_model`, ' 26 | '`init_pose_model`, and `vis_pose_result` form ' 27 | '`mmpose.apis`. These apis are required in this demo! 
') 28 | 29 | try: 30 | import moviepy.editor as mpy 31 | except ImportError: 32 | raise ImportError('Please install moviepy to enable output file') 33 | 34 | CONSOLE = Console() 35 | 36 | FONTFACE = cv2.FONT_HERSHEY_DUPLEX 37 | FONTSCALE = 0.85 38 | FONTCOLOR = (255, 255, 0) # BGR, white 39 | FONTCOLOR_SCORE = (0, 165, 255) 40 | THICKNESS = 1 41 | LINETYPE = 1 42 | 43 | # TODO: add json option 44 | 45 | 46 | def parse_args(): 47 | parser = argparse.ArgumentParser(description='MMAction2 demo') 48 | parser.add_argument('video', help='video file/url') 49 | parser.add_argument('out_filename', help='output filename') 50 | parser.add_argument( 51 | '--config', 52 | default=('configs/skeleton/posec3d/' 53 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'), 54 | help='skeleton model config file path') 55 | parser.add_argument( 56 | '--checkpoint', 57 | default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/' 58 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint/' 59 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth'), 60 | help='skeleton model checkpoint file/url') 61 | parser.add_argument( 62 | '--det-config', 63 | default='mmaction2/demo/faster_rcnn_r50_fpn_2x_coco.py', 64 | help='human detection config file path (from mmdet)') 65 | parser.add_argument( 66 | '--det-checkpoint', 67 | default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' 68 | 'faster_rcnn_r50_fpn_2x_coco/' 69 | 'faster_rcnn_r50_fpn_2x_coco_' 70 | 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), 71 | help='human detection checkpoint file/url') 72 | parser.add_argument( 73 | '--pose-config', 74 | default='mmaction2/demo/hrnet_w32_coco_256x192.py', 75 | help='human pose estimation config file path (from mmpose)') 76 | parser.add_argument( 77 | '--pose-checkpoint', 78 | default=('https://download.openmmlab.com/mmpose/top_down/hrnet/' 79 | 'hrnet_w32_coco_256x192-c78dce93_20200708.pth'), 80 | help='human pose estimation checkpoint file/url') 81 | parser.add_argument('--det-score-thr', 82 | type=float, 83 | default=0.8, 84 | help='the threshold of human detection score') 85 | parser.add_argument('--label-map', 86 | default='tools/data/skeleton/label_map_ntu120.txt', 87 | help='label map file') 88 | parser.add_argument('--device', 89 | type=str, 90 | default='cuda:0', 91 | help='CPU/CUDA device option') 92 | parser.add_argument('--short-side', 93 | type=int, 94 | default=480, 95 | help='specify the short-side length of the image') 96 | parser.add_argument( 97 | '--cfg-options', 98 | nargs='+', 99 | action=DictAction, 100 | default={}, 101 | help='override some settings in the used config, the key-value pair ' 102 | 'in xxx=yyy format will be merged into config file. For example, ' 103 | "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") 104 | parser.add_argument('--pose-score-thr', 105 | type=float, 106 | default=0.4, 107 | help='pose estimation score threshold') 108 | parser.add_argument( 109 | '--correct-rate', 110 | type=float, 111 | default=0.4, 112 | help=('if less than this rate of frame poses have a ' 113 | 'lower confidence than `poses-score-thr`, skip the demo')) 114 | args = parser.parse_args() 115 | return args 116 | 117 | 118 | def frame_extraction(video_path, short_side): 119 | """Extract frames given video_path. 120 | 121 | Args: 122 | video_path (str): The video_path. 
123 | """ 124 | # Load the video, extract frames into ./tmp/video_name 125 | target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) 126 | os.makedirs(target_dir, exist_ok=True) 127 | # Should be able to handle videos up to several hours 128 | frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') 129 | vid = cv2.VideoCapture(video_path) 130 | frames = [] 131 | frame_paths = [] 132 | flag, frame = vid.read() 133 | cnt = 0 134 | new_h, new_w = None, None 135 | while flag: 136 | if new_h is None: 137 | h, w, _ = frame.shape 138 | new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) 139 | 140 | frame = mmcv.imresize(frame, (new_w, new_h)) 141 | 142 | frames.append(frame) 143 | frame_path = frame_tmpl.format(cnt + 1) 144 | frame_paths.append(frame_path) 145 | 146 | cv2.imwrite(frame_path, frame) 147 | cnt += 1 148 | flag, frame = vid.read() 149 | 150 | return frame_paths, frames 151 | 152 | 153 | def detection_inference(args, frame_paths): 154 | """Detect human boxes given frame paths. 155 | 156 | Args: 157 | args (argparse.Namespace): The arguments. 158 | frame_paths (list[str]): The paths of frames to do detection inference. 159 | 160 | Returns: 161 | list[np.ndarray]: The human detection results. 162 | """ 163 | model = init_detector(args.det_config, args.det_checkpoint, args.device) 164 | assert model.CLASSES[0] == 'person', ('We require you to use a detector ' 165 | 'trained on COCO') 166 | results = [] 167 | print('Performing Human Detection for each frame') 168 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 169 | for frame_path in frame_paths: 170 | result = inference_detector(model, frame_path) 171 | # We only keep human detections with score larger than det_score_thr 172 | result = result[0][result[0][:, 4] >= args.det_score_thr] 173 | results.append(result) 174 | prog_bar.update() 175 | return results 176 | 177 | 178 | def pose_inference(args, frame_paths, det_results): 179 | model = init_pose_model(args.pose_config, args.pose_checkpoint, 180 | args.device) 181 | ret = [] 182 | print('Performing Human Pose Estimation for each frame') 183 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 184 | for f, d in zip(frame_paths, det_results): 185 | # Align input format 186 | d = [dict(bbox=x) for x in list(d)] 187 | pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] 188 | ret.append(pose) 189 | prog_bar.update() 190 | return ret 191 | 192 | 193 | def main(): 194 | args = parse_args() 195 | 196 | frame_paths, original_frames = frame_extraction(args.video, 197 | args.short_side) 198 | num_frame = len(frame_paths) 199 | h, w, _ = original_frames[0].shape 200 | 201 | # Get clip_len, frame_interval and calculate center index of each clip 202 | config = mmcv.Config.fromfile(args.config) 203 | config.merge_from_dict(args.cfg_options) 204 | for component in config.data.test.pipeline: 205 | if component['type'] == 'PoseNormalize': 206 | component['mean'] = (w // 2, h // 2, .5) 207 | component['max_value'] = (w, h, 1.) 
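# NOTE: the loop above patches any `PoseNormalize` step in the test pipeline with this clip's actual resolution, using the frame centre as the mean and the frame width/height as the scale, so keypoints are normalized relative to the clip rather than a hard-coded resolution.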
208 | 209 | model = init_recognizer(config, args.checkpoint, args.device) 210 | 211 | # Load label_map 212 | label_map = [x.strip() for x in open(args.label_map).readlines()] 213 | 214 | # Get Human detection results 215 | det_results = detection_inference(args, frame_paths) 216 | torch.cuda.empty_cache() 217 | 218 | pose_results = pose_inference(args, frame_paths, det_results) 219 | torch.cuda.empty_cache() 220 | 221 | fake_anno = dict(frame_dir='', 222 | label=-1, 223 | img_shape=(h, w), 224 | original_shape=(h, w), 225 | start_index=0, 226 | modality='Pose', 227 | total_frames=num_frame) 228 | num_person = max([len(x) for x in pose_results]) 229 | num_person = 2 # TODO: one person can also be in the frame 230 | CONSOLE.print(f'# Persons: {num_person}\n', style='green') 231 | 232 | num_keypoint = 17 233 | keypoint = np.zeros((num_person, num_frame, num_keypoint, 2), 234 | dtype=np.float16) 235 | keypoint_score = np.zeros((num_person, num_frame, num_keypoint), 236 | dtype=np.float16) 237 | for i, poses in enumerate(pose_results): 238 | for j, pose in enumerate(poses): 239 | pose = pose['keypoints'] 240 | try: 241 | keypoint[j, i] = pose[:, :2] 242 | except IndexError: 243 | continue 244 | keypoint_score[j, i] = pose[:, 2] 245 | 246 | fake_anno['keypoint'] = keypoint 247 | fake_anno['keypoint_score'] = keypoint_score 248 | count_0 = 0 249 | 250 | for k in range(0, num_person): 251 | for i in range(0, num_frame): 252 | for j in range(0, 17): # 17 defined keypoints 253 | if fake_anno['keypoint_score'][k][i][j] < args.pose_score_thr: 254 | # fake_anno['keypoint'][k][i][j] = 0 255 | count_0 += 1 256 | 257 | correct_rate = 1 - round(count_0 / (num_person * num_frame * 17), 3) 258 | if correct_rate < args.correct_rate: 259 | CONSOLE.print((f'Clip has correct rate of {correct_rate} lower than ' 260 | f'the threshold of {args.correct_rate}. 
Skipping...'), 261 | style='red') 262 | tmp_frame_dir = osp.dirname(frame_paths[0]) 263 | shutil.rmtree(tmp_frame_dir) 264 | return 265 | 266 | results = inference_recognizer(model, fake_anno) 267 | 268 | top_actions = 3 269 | action_labels = [label_map[results[i][0]] for i in range(top_actions)] 270 | action_scores = [results[i][1] for i in range(top_actions)] 271 | 272 | pose_model = init_pose_model(args.pose_config, args.pose_checkpoint, 273 | args.device) 274 | vis_frames = [ 275 | vis_pose_result(pose_model, frame_paths[i], pose_results[i]) 276 | for i in range(num_frame) 277 | ] 278 | x, y = 10, 30 279 | x_y_dist = 200 280 | for frame in vis_frames: 281 | i = 0 282 | for label, score in zip(action_labels, action_scores): 283 | i += 1 284 | cv2.putText(frame, label, (x, y * i), FONTFACE, FONTSCALE, 285 | FONTCOLOR, THICKNESS, LINETYPE) 286 | cv2.putText(frame, str(round(100 * score, 287 | 2)), (x + x_y_dist, y * i), FONTFACE, 288 | FONTSCALE, FONTCOLOR_SCORE, THICKNESS, LINETYPE) 289 | 290 | vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24) 291 | vid.write_videofile(args.out_filename, remove_temp=True) 292 | 293 | tmp_frame_dir = osp.dirname(frame_paths[0]) 294 | shutil.rmtree(tmp_frame_dir) 295 | 296 | 297 | if __name__ == '__main__': 298 | main() 299 | -------------------------------------------------------------------------------- /src/demo/long_video_demo_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import random 5 | import string 6 | import subprocess 7 | from itertools import repeat 8 | from multiprocessing import Manager, Pool, cpu_count 9 | 10 | import moviepy.editor as mpy 11 | import numpy as np 12 | from rich.console import Console 13 | 14 | CONSOLE = Console() 15 | manager = Manager() 16 | clips = manager.list() 17 | json_res = manager.list() 18 | 19 | MIN_CLIP_DUR = None 20 | 21 | 22 | def gen_id(size=8): 23 | """Generate a random id.""" 24 | chars = string.ascii_uppercase + string.digits 25 | return ''.join(random.choice(chars) for _ in range(size)) 26 | 27 | 28 | def prettify(byte_content): 29 | decoded = byte_content.decode('utf-8') 30 | formatted_output = decoded.replace('\\n', '\n').replace('\\t', '\t') 31 | return formatted_output 32 | 33 | 34 | def delete_clips(clips): 35 | for clip in clips: 36 | try: 37 | os.unlink(clip) 38 | except FileNotFoundError: 39 | pass 40 | 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser( 44 | description='long video demo based on clips') 45 | parser.add_argument('video', help='video file') 46 | parser.add_argument('config', help='model config file') 47 | parser.add_argument('checkpoint', help='model checkpoint') 48 | parser.add_argument('out', help='out file. 
Video or Json') 49 | parser.add_argument('--ann', 50 | type=str, 51 | default='resources/annotations/annotations_pose.txt', 52 | help='for base or eval annotations') 53 | parser.add_argument('--type', 54 | type=str, 55 | default='pose', 56 | choices=['pose', 'recognition'], 57 | help='whether the demo will be pose or recognition') 58 | parser.add_argument('--num-processes', 59 | type=int, 60 | default=(cpu_count() - 1 or 1), 61 | help='Number of processes to extract subclips') 62 | parser.add_argument('--num-gpus', 63 | type=int, 64 | default=1, 65 | help='Number of gpus to perform pose-har') 66 | parser.add_argument('--subclip-duration', 67 | type=int, 68 | default=7, 69 | help='duration of subclips') 70 | args = parser.parse_args() 71 | return args 72 | 73 | 74 | def pose(items): 75 | gpu, clips, args = items 76 | script_path = 'src/demo/demo_skeleton.py' 77 | if not osp.exists(script_path): 78 | CONSOLE.print(f'{script_path} does not exist', style='red') 79 | for clip in clips: 80 | subargs = [ 81 | 'python', 82 | script_path, 83 | clip, 84 | clip, # overwrite original clip 85 | '--config', 86 | args.config, 87 | '--checkpoint', 88 | args.checkpoint, 89 | '--label-map', 90 | args.ann, # class annotations 91 | '--device', 92 | gpu 93 | ] 94 | result = subprocess.run(subargs, capture_output=True) 95 | error = result.stderr.decode('utf-8') 96 | if error: 97 | CONSOLE.print(error, style='red') 98 | 99 | 100 | def recognition(items): 101 | gpu, clips, args = items 102 | script_path = 'demo/demo.py' 103 | for clip in clips: 104 | subargs = [ 105 | 'python', 106 | script_path, 107 | args.config, 108 | args.checkpoint, 109 | clip, 110 | args.ann, # class annotations 111 | '--font-color', 112 | 'blue', 113 | '--out-filename', 114 | clip, # overwrite original clip 115 | '--device', 116 | gpu 117 | ] 118 | try: 119 | subprocess.check_output(subargs) 120 | except Exception as e: 121 | CONSOLE.print(e, style='bold red') 122 | 123 | 124 | def extract_subclip(items): 125 | ts, timestamps, video = items 126 | video = mpy.VideoFileClip(video) 127 | start = timestamps[ts[0]] 128 | finish = timestamps[ts[1]] 129 | 130 | clip_pth = f'{ts[0]}_{gen_id()}.mp4' 131 | clips.append(clip_pth) 132 | 133 | try: 134 | clip = video.subclip(start, finish) 135 | if clip.duration < MIN_CLIP_DUR: 136 | CONSOLE.print(f'Subclip duration < {MIN_CLIP_DUR}. 
Skipping...', 137 | style='yellow') 138 | return 139 | clip.write_videofile(clip_pth, logger=None, audio=False) 140 | except OSError as e: 141 | CONSOLE.print(e, style='bold red') 142 | pass 143 | finally: 144 | video.close() 145 | 146 | 147 | def merge_clips(clips, out): 148 | clips = sorted(clips, key=lambda x: int(x[2:4])) 149 | video_clips = [] 150 | for clip in clips: 151 | try: 152 | video_clips.append(mpy.VideoFileClip(clip)) 153 | except OSError: 154 | pass 155 | 156 | result = mpy.concatenate_videoclips(video_clips, method='compose') 157 | result.write_videofile(out) 158 | delete_clips(clips) 159 | 160 | 161 | def merge_json(json_res, time_segments, out): 162 | result = {} 163 | json_res = sorted(json_res, key=lambda x: int(x[:2])) 164 | for tup in zip(time_segments, json_res): 165 | result[str(tup[0])] = tup[1].split(' ', 1)[1].strip() 166 | 167 | import json 168 | with open(out, 'w') as f: 169 | json.dump(result, f, indent=2) 170 | 171 | 172 | def main(): 173 | args = parse_args() 174 | global MIN_CLIP_DUR 175 | MIN_CLIP_DUR = args.subclip_duration 176 | 177 | splits = int( 178 | mpy.VideoFileClip(args.video).duration / args.subclip_duration) 179 | timestamps = { 180 | f'ts{i:02}': args.subclip_duration * i 181 | for i in range(0, splits + 1) 182 | } 183 | time_segments = [(f'ts{i:02}', f'ts{i+1:02}') for i in range(0, splits)] 184 | # add a timestamp for any remaining segments < 10s 185 | rest_timestamp = f'ts{int(list(timestamps.keys())[-1][2:]) + 1}' 186 | timestamps[rest_timestamp] = None 187 | time_segments.append( 188 | (list(timestamps.keys())[-2], list(timestamps.keys())[-1])) 189 | 190 | CONSOLE.print('Extracting subclips...', style='green') 191 | pool1 = Pool(args.num_processes) 192 | gpus = [f'cuda:{i}' for i in range(args.num_gpus)] 193 | pool1.map(extract_subclip, 194 | zip(time_segments, repeat(timestamps), repeat(args.video))) 195 | 196 | pool2 = Pool(len(gpus)) 197 | callback = pose if args.type == 'pose' else recognition 198 | CONSOLE.print(f'Performing {args.type}...', style='green') 199 | clips_per_gpus = [ 200 | label_split for label_split in np.array_split(clips, args.num_gpus) 201 | ] 202 | pool2.map(callback, zip(gpus, clips_per_gpus, repeat(args))) 203 | 204 | merge_clips(clips, args.out.split('.')[0] + '.mp4') 205 | if args.out.endswith('.json'): 206 | merge_json(json_res, time_segments, args.out) 207 | 208 | 209 | if __name__ == '__main__': 210 | main() 211 | -------------------------------------------------------------------------------- /src/demo/visualize_heatmap_volume.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import sys 4 | 5 | import cv2 6 | import decord 7 | import moviepy.editor as mpy 8 | import numpy as np 9 | from mmaction.datasets.pipelines import Compose 10 | from mmcv import load 11 | 12 | from mmpose.apis import vis_pose_result 13 | from mmpose.models import TopDown 14 | 15 | sys.path.append('src/') # noqa 16 | import utils as utils # noqa isort:skip 17 | 18 | keypoint_pipeline = [ 19 | dict(type='PoseDecode'), 20 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 21 | dict(type='Resize', scale=(-1, 64)), 22 | dict(type='CenterCrop', crop_size=64), 23 | dict(type='GeneratePoseTarget', 24 | sigma=0.6, 25 | use_score=True, 26 | with_kp=True, 27 | with_limb=False) 28 | ] 29 | 30 | limb_pipeline = [ 31 | dict(type='PoseDecode'), 32 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 33 | dict(type='Resize', scale=(-1, 64)), 34 | 
dict(type='CenterCrop', crop_size=64), 35 | dict(type='GeneratePoseTarget', 36 | sigma=0.6, 37 | use_score=True, 38 | with_kp=False, 39 | with_limb=True) 40 | ] 41 | 42 | FONTFACE = cv2.FONT_HERSHEY_DUPLEX 43 | FONTSCALE = 0.6 44 | FONTCOLOR = (255, 255, 255) 45 | BGBLUE = (0, 119, 182) 46 | THICKNESS = 1 47 | LINETYPE = 1 48 | 49 | 50 | def add_label(frame, label, BGCOLOR=BGBLUE): 51 | threshold = 30 52 | 53 | def split_label(label): 54 | label = label.split() 55 | lines, cline = [], '' 56 | for word in label: 57 | if len(cline) + len(word) < threshold: 58 | cline = cline + ' ' + word 59 | else: 60 | lines.append(cline) 61 | cline = word 62 | if cline != '': 63 | lines += [cline] 64 | return lines 65 | 66 | if len(label) > 30: 67 | label = split_label(label) 68 | else: 69 | label = [label] 70 | label = ['Action: '] + label 71 | 72 | sizes = [] 73 | for line in label: 74 | sizes.append(cv2.getTextSize(line, FONTFACE, FONTSCALE, THICKNESS)[0]) 75 | box_width = max([x[0] for x in sizes]) + 10 76 | text_height = sizes[0][1] 77 | box_height = len(sizes) * (text_height + 6) 78 | 79 | cv2.rectangle(frame, (0, 0), (box_width, box_height), BGCOLOR, -1) 80 | for i, line in enumerate(label): 81 | location = (5, (text_height + 6) * i + text_height + 3) 82 | cv2.putText(frame, line, location, FONTFACE, FONTSCALE, FONTCOLOR, 83 | THICKNESS, LINETYPE) 84 | return frame 85 | 86 | 87 | def vis_skeleton(vid_path, anno, category_name=None, ratio=0.5): 88 | vid = decord.VideoReader(vid_path) 89 | frames = [x.asnumpy() for x in vid] 90 | 91 | h, w, _ = frames[0].shape 92 | new_shape = (int(w * ratio), int(h * ratio)) 93 | frames = [cv2.resize(f, new_shape) for f in frames] 94 | 95 | assert len(frames) == anno['total_frames'] 96 | # The shape is N x T x K x 3 97 | kps = np.concatenate([anno['keypoint'], anno['keypoint_score'][..., None]], 98 | axis=-1) 99 | kps[..., :2] *= ratio 100 | # Convert to T x N x K x 3 101 | kps = kps.transpose([1, 0, 2, 3]) 102 | vis_frames = [] 103 | 104 | # we need an instance of TopDown model, so build a minimal one 105 | model = TopDown(backbone=dict(type='ShuffleNetV1')) 106 | 107 | for f, kp in zip(frames, kps): 108 | bbox = np.zeros([0, 4], dtype=np.float32) 109 | result = [dict(bbox=bbox, keypoints=k) for k in kp] 110 | vis_frame = vis_pose_result(model, f, result) 111 | 112 | if category_name is not None: 113 | vis_frame = add_label(vis_frame, category_name) 114 | 115 | vis_frames.append(vis_frame) 116 | return vis_frames 117 | 118 | 119 | def get_pseudo_heatmap(anno, flag='keypoint'): 120 | assert flag in ['keypoint', 'limb'] 121 | pipeline = Compose(keypoint_pipeline if flag == 122 | 'keypoint' else limb_pipeline) 123 | return pipeline(anno)['imgs'] 124 | 125 | 126 | def vis_heatmaps(heatmaps, channel=-1, ratio=8): 127 | # if channel is -1, draw all keypoints / limbs on the same map 128 | import matplotlib.cm as cm 129 | h, w, _ = heatmaps[0].shape 130 | newh, neww = int(h * ratio), int(w * ratio) 131 | 132 | if channel == -1: 133 | heatmaps = [np.max(x, axis=-1) for x in heatmaps] 134 | cmap = cm.viridis 135 | heatmaps = [(cmap(x)[..., :3] * 255).astype(np.uint8) for x in heatmaps] 136 | heatmaps = [cv2.resize(x, (neww, newh)) for x in heatmaps] 137 | return heatmaps 138 | 139 | 140 | def parse_args(): 141 | parser = argparse.ArgumentParser(description='Visualize Pose & Heatmap') 142 | parser.add_argument('video', type=str, help='source video') 143 | parser.add_argument('pose_ann', type=str, help='pose pickle annotation') 144 | parser.add_argument('--ann', 145 | type=str, 
146 | default='resources/annotations/annotations_pose.txt', 147 | help='dataset annotations') 148 | parser.add_argument('--det-score-thr', 149 | type=float, 150 | help='detection score threshold') 151 | parser.add_argument('--out-dir', type=str, default='demos/') 152 | parser.add_argument('--device', type=str, default='cuda:0') 153 | args = parser.parse_args() 154 | return args 155 | 156 | 157 | def main(): 158 | args = parse_args() 159 | anno = load(args.pose_ann) 160 | categories = utils.annotations_list(args.ann) 161 | video_name = osp.splitext(args.video.split('/')[-1])[0] 162 | 163 | # visualize skeleton 164 | vis_frames = vis_skeleton(args.video, 165 | anno, 166 | categories[anno['label']], 167 | ratio=1) 168 | cv2.imwrite(osp.join(args.out_dir, f'{video_name}_pose.jpg'), 169 | vis_frames[int(len(vis_frames) / 2)]) 170 | vid = mpy.ImageSequenceClip(vis_frames, fps=24) 171 | vid.write_videofile(osp.join(args.out_dir, f'{video_name}_pose.mp4')) 172 | 173 | # visualize heatmaps 174 | keypoint_heatmap = get_pseudo_heatmap(anno) 175 | keypoint_mapvis = vis_heatmaps(keypoint_heatmap) 176 | keypoint_mapvis = [ 177 | add_label(f, categories[anno['label']]) for f in keypoint_mapvis 178 | ] 179 | vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24) 180 | vid.write_videofile(osp.join(args.out_dir, f'{video_name}_heatmap.mp4')) 181 | 182 | 183 | if __name__ == '__main__': 184 | main() 185 | -------------------------------------------------------------------------------- /src/late_fusion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import sys 4 | 5 | from mmaction.core.evaluation import (get_weighted_score, mean_class_accuracy, 6 | top_k_accuracy) 7 | from mmcv import load 8 | from rich.console import Console 9 | from scipy.special import softmax 10 | 11 | sys.path.append('./tools') # noqa 12 | import utils as utils # noqa isort:skip 13 | 14 | CONSOLE = Console() 15 | 16 | 17 | def get_class_id(path: str) -> int: 18 | """Get the label id of a clip given its path (e.g. 1). 19 | 20 | Args: 21 | path (str): path to clip 22 | 23 | Returns: 24 | int: label id of clips 25 | """ 26 | return int(osp.splitext(osp.basename(path.split()[1]))[0]) 27 | 28 | 29 | def get_clip_id(path: str) -> str: 30 | """Get the name (id) of a clip given its path (e.g. XNXXNAER). 31 | 32 | Args: 33 | path (str): path to clip 34 | 35 | Returns: 36 | str: clip name(id) 37 | """ 38 | return osp.splitext(osp.basename(path.split()[0]))[0] 39 | 40 | 41 | def clip_ids(datalist: list) -> list: 42 | """Returns a list of clip ids given the datalist. 
43 | 44 | Args: 45 | datalist (list): label map 46 | 47 | Returns: 48 | list: of ids 49 | """ 50 | return [get_clip_id(d) for d in datalist] 51 | 52 | 53 | def parse_args(): 54 | parser = argparse.ArgumentParser(description='Fusing multiple scores') 55 | parser.add_argument('--scores', 56 | nargs='+', 57 | help='list of scores', 58 | default=['demo/fuse/rgb.pkl', 'demo/fuse/flow.pkl']) 59 | parser.add_argument('--coefficients', 60 | nargs='+', 61 | type=float, 62 | help='coefficients of each score file', 63 | default=[1.0, 1.0]) 64 | parser.add_argument( 65 | '--datalists', 66 | nargs='+', 67 | help='list of testing data', 68 | default=[ 69 | 'mmaction2/data/phar/val.txt', 70 | 'mmaction2/data/phar/audio_feature/filtered_20/val.txt' 71 | ]) 72 | parser.add_argument('--apply-softmax', action='store_true') 73 | parser.add_argument('--top-k', 74 | nargs='+', 75 | type=int, 76 | default=[1, 2, 3, 4, 5], 77 | help='top k accuracy to calculate') 78 | parser.add_argument('--label-map', 79 | nargs='+', 80 | help='annotation files', 81 | default=[ 82 | 'resources/annotations/annotations.txt', 83 | 'resources/annotations/annotations_audio.txt' 84 | ]) 85 | args = parser.parse_args() 86 | return args 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | assert len(args.scores) == len(args.coefficients) == len(args.label_map) 92 | 93 | lmaps = [] 94 | for lmap in args.label_map: 95 | lmaps.append(utils.annotations_dict_rev(lmap)) 96 | score_list = [load(f) for f in args.scores] 97 | data = [open(dl).readlines() for dl in args.datalists] 98 | 99 | # superset contains all the samples to be tested 100 | superset = max(data, key=len) 101 | superset_score = max(score_list, key=len) 102 | superset_lmap = max(lmaps, key=len) 103 | # remove the superset from the lists 104 | i = 0 105 | while i < len(data): 106 | if data[i] is superset: 107 | data.remove(data[i]) 108 | score_list.remove(score_list[i]) 109 | lmaps.remove(lmaps[i]) 110 | break 111 | i += 1 112 | 113 | # reload superset labels 114 | superset_lmap = utils.annotations_dic(args.label_map[i]) 115 | labels = [int(x.strip().split()[-1]) for x in superset] 116 | superset_ids = clip_ids(superset) 117 | for d in data: 118 | # CONSOLE.print(set(clip_ids(d)).difference(superset_ids)) 119 | assert set(clip_ids(d)).issubset(superset_ids) 120 | 121 | # order & fill in the scores of the subsets according to the superset 122 | ordered_scores = [] 123 | superset_ids = clip_ids(superset) 124 | zeros = [0 for _ in range(len(superset_score[0]))] 125 | for i in range(len(score_list)): 126 | ordered_scores.append(list()) 127 | data_ids = clip_ids(data[i]) 128 | for clip in superset: 129 | id = get_clip_id(clip) 130 | if id not in data_ids: 131 | ordered_scores[i].append(zeros) 132 | else: 133 | score = score_list[i][data_ids.index(id)] 134 | to_add = zeros.copy() 135 | for j in range(len(score)): 136 | # add the scores of the models with less classes in the 137 | # exact same position as it is in the model that contains 138 | # all the classes 139 | index = superset_lmap[lmaps[i][j]] 140 | to_add[index] = score[j] 141 | ordered_scores[i].append(to_add) 142 | 143 | ordered_scores.insert(0, superset_score) 144 | 145 | if args.apply_softmax: 146 | 147 | def apply_softmax(scores): 148 | return [softmax(score) for score in scores] 149 | 150 | ordered_scores = [apply_softmax(scores) for scores in ordered_scores] 151 | 152 | weighted_scores = get_weighted_score(ordered_scores, args.coefficients) 153 | CONSOLE.print('Weighted Scores', style='green') 154 | mean_class_acc = 
mean_class_accuracy(weighted_scores, labels) 155 | top_k = top_k_accuracy(weighted_scores, labels, args.top_k) 156 | print(f'Mean Class Accuracy: {mean_class_acc:.04f}') 157 | for k, topk in enumerate(top_k): 158 | CONSOLE.print(f'Top {k+1} Accuracy: {topk:.04f}') 159 | 160 | 161 | if __name__ == '__main__': 162 | main() 163 | -------------------------------------------------------------------------------- /src/record_experiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import re 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | 7 | import mlflow 8 | from rich.console import Console 9 | 10 | CONSOLE = Console() 11 | 12 | 13 | def parse_args(): 14 | parser = ArgumentParser(prog='track experiments with mlflow tracking' 15 | 'https://mlflow.org/docs/latest/tracking.html') 16 | parser.add_argument( 17 | 'experiment_name', 18 | help='name of experiment. Should correspond the model name') 19 | parser.add_argument( 20 | 'run_name', 21 | help='name of experiment run. Add things like hyperparameters here.') 22 | parser.add_argument('work_dir', help='dir where model files are stored') 23 | parser.add_argument('--mlrun-dir', 24 | default='./mlruns', 25 | help='mlrun storage dir. Leave default.') 26 | parser.add_argument('--data-dir', 27 | default='mmaction2/data/phar/', 28 | help='path to train/val/test dataset') 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def get_train_acc(log, start, topk_length, top_train): 34 | """Get training accuracy from mmaction2 log files.""" 35 | # * play with these two parameters if the results aren't perfect 36 | # audio: 1400, 6 37 | look_back, n_back = 1400, 6 38 | # train indexes start before needles[1] 39 | train_index = start 40 | # take average of last n_back readings 41 | for row in log[train_index - look_back:train_index].split('\t'): 42 | for i in range(1, 6): 43 | t = f'top{i}' 44 | sub_index = row.find(t) 45 | if sub_index == -1: 46 | break 47 | 48 | topk = row[sub_index:sub_index + topk_length] 49 | topk = float(topk.split('acc: ')[1]) 50 | top_train[t] += topk 51 | 52 | top_train = {k: round(v / n_back, 3) for k, v in top_train.items()} 53 | return top_train 54 | 55 | 56 | def get_train_val_acc(logs): 57 | """Get the validation & training accuracy from mmaction2 log files.""" 58 | 59 | # specific to mmaction2 logs 60 | needles = ('Now best checkpoint is saved as', 'Evaluating top_k_accuracy') 61 | topk_length = 15 62 | 63 | top_val = {f'top{k}': 0 for k in range(1, 6)} 64 | top_train = {f'top{k}': 0 for k in range(1, 6)} 65 | 66 | for log in logs: 67 | # find all indexes for new best models logs 68 | new_best_indexes = [m.start() for m in re.finditer(needles[0], log)] 69 | 70 | for index in new_best_indexes: 71 | # topks are replaced only if top1 is exceeded 72 | replace = False 73 | # find the start of the new best models log 74 | start = log[:index].rfind(needles[1]) 75 | 76 | for i in range(1, 6): 77 | t = f'top{i}' 78 | sub_index = log[start:index].find(t) 79 | topk = log[start + sub_index:start + sub_index + topk_length] 80 | topk = float(topk.split('acc')[1]) 81 | 82 | if topk > top_val[t] and t == 'top1': 83 | replace = True 84 | if replace: 85 | top_val[t] = topk 86 | 87 | if not replace: 88 | continue 89 | 90 | try: 91 | top_train = get_train_acc(log, start, topk_length, top_train) 92 | except IndexError: 93 | CONSOLE.print('Log is missing train infos', style='yellow') 94 | 95 | return top_train, top_val 96 | 97 | 98 | def 
get_last_model(dir): 99 | """Get the latest checkpoint of a model.""" 100 | latest = osp.join(dir, 'latest.pth') 101 | if os.path.exists(latest): 102 | os.remove(osp.join(latest)) 103 | models = [m for m in os.listdir(dir) if m.endswith('.pth')] 104 | 105 | return sorted(models, 106 | key=lambda x: int(''.join([d for d in x if d.isdigit()])), 107 | reverse=True) 108 | 109 | 110 | def get_top_model(dir): 111 | return [model for model in os.listdir(dir) if model[:4] == 'best'] 112 | 113 | 114 | def find_artifact(dir, ext, hint=''): 115 | """Given a folder, find files based on their extension and part of name.""" 116 | return [ 117 | file for file in os.listdir(dir) 118 | if (osp.splitext(file)[1] == ext and hint in file) 119 | ] 120 | 121 | 122 | def main(): 123 | args = parse_args() 124 | CONSOLE.print(f'Logging {args.experiment_name}-{args.run_name}...', 125 | style='green') 126 | Path(args.mlrun_dir).mkdir(parents=True, exist_ok=True) 127 | mlflow.set_tracking_uri(args.mlrun_dir) 128 | mlflow.set_experiment(args.experiment_name) 129 | 130 | with mlflow.start_run(run_name=args.run_name): 131 | logs = [] 132 | # log artifacts from work dir 133 | for ext in ['.json', '.log', '.py', '.txt', '.pkl']: 134 | for artifact in find_artifact(args.work_dir, ext): 135 | mlflow.log_artifact(osp.join(args.work_dir, artifact)) 136 | if ext == '.log': 137 | with open(osp.join(args.work_dir, artifact), 'r') as f: 138 | logs.append(f.read()) 139 | 140 | for ext in ['.txt', '.pkl']: 141 | for artifact in find_artifact(args.data_dir, ext): 142 | mlflow.log_artifact(osp.join(args.data_dir, artifact)) 143 | 144 | top_model = get_top_model(args.work_dir) 145 | if not top_model: 146 | CONSOLE.print(f'No best model found @{args.work_dir}', 147 | style='yellow') 148 | else: 149 | mlflow.log_artifact(osp.join(args.work_dir, top_model[0])) 150 | 151 | last_model = get_last_model(args.work_dir) 152 | if not last_model or len(last_model) == 1: 153 | CONSOLE.print(f'Last saved checkpoint not found @{args.work_dir}', 154 | style='yellow') 155 | else: 156 | last_model = list( 157 | filter(lambda x: not x.startswith('best'), last_model)) 158 | mlflow.log_artifact(osp.join(args.work_dir, last_model[0])) 159 | 160 | train_acc, val_acc = get_train_val_acc(logs) 161 | 162 | mlflow.log_params({ 163 | 'model': args.experiment_name, 164 | 'run': args.run_name, 165 | 'train acc': f'{train_acc}', 166 | 'val acc': f'{val_acc}', 167 | 'test acc': 'NA' 168 | }) 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /src/schedule_stuff.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import subprocess 4 | 5 | import schedule 6 | from rich.console import Console 7 | 8 | CONSOLE = Console() 9 | 10 | # https://schedule.readthedocs.io/en/stable/examples.html 11 | 12 | 13 | def pose_feasibility(cat, out_dir='mmaction2/data/phar/pose'): 14 | """Schedule for the pose_feasibility.py script.""" 15 | CONSOLE.print(f'Checking pose feasibility for {cat}...', style='green') 16 | script_path = 'tools/analysis/pose_feasibility.py' 17 | 18 | subargs = ['python', script_path, cat, '--out-dir', out_dir, '--resume'] 19 | subprocess.run(subargs) 20 | return schedule.CancelJob 21 | 22 | 23 | def extract_audio(in_dir, out_dir): 24 | """Scheduler to extract audio from videos_val. 
25 | 26 | Args: 27 | in_dir (_type_): _description_ 28 | out_dir (_type_): _description_ 29 | 30 | Returns: 31 | _type_: _description_ 32 | """ 33 | import time 34 | script_dir = '/mmaction2/tools/data/extract_audio.py' 35 | for dIr in os.listdir(in_dir): 36 | CONSOLE.print(f'Extracting videos for {dIr}...', style='green') 37 | CONSOLE.print(osp.join(in_dir, dIr)) 38 | CONSOLE.print(osp.join(out_dir, dIr)) 39 | 40 | subargs = [ 41 | 'python', script_dir, 42 | osp.join(in_dir, dIr), 43 | osp.join(out_dir, dIr), '--level', '1', '--ext', 'avi' 44 | ] 45 | subprocess.run(subargs) 46 | time.sleep(30) 47 | return schedule.CancelJob 48 | 49 | 50 | def extract_audio_feature(in_dir, out_dir): 51 | """Extract spectogram features from audio. 52 | 53 | Args: 54 | in_dir (_type_): _description_ 55 | out_dir (_type_): _description_ 56 | 57 | Returns: 58 | _type_: _description_ 59 | """ 60 | script_dir = '/mmaction2/tools/data/build_audio_features.py' 61 | for dIr in os.listdir(in_dir): 62 | dir_path = osp.join(in_dir, dIr) 63 | for audio in os.listdir(dir_path): 64 | audio_path = osp.join(dir_path, audio) 65 | subargs = [ 66 | 'python', script_dir, audio_path, 67 | osp.join(out_dir, 68 | audio.split('.')[0] + '.npy'), '--level', '1', 69 | '--ext', 'avi' 70 | ] 71 | subprocess.run(subargs) 72 | 73 | return schedule.CancelJob 74 | 75 | 76 | def train_model(config: str, 77 | work_dir: str, 78 | resume_from=None, 79 | cfg_options=None): 80 | script_path = 'mmaction2/tools/dist_train.sh' 81 | no_gpus = 1 82 | subargs = [ 83 | 'bash', script_path, config, 84 | str(no_gpus), '--work-dir', work_dir, '--validate' 85 | ] 86 | if resume_from: 87 | subargs.append('--resume-from') 88 | subargs.append(resume_from) 89 | if cfg_options: 90 | subargs.append('--cfg-options') 91 | for tup in cfg_options.items(): 92 | subargs.append(f'{tup[0]}={tup[1]}') 93 | subprocess.run(subargs) 94 | 95 | 96 | def demo(in_video, out_video): 97 | script_path = 'src/demo/multimodial_demo.py' 98 | subargs = ['python', script_path, in_video, out_video] 99 | subprocess.run(subargs) 100 | 101 | 102 | schedule.every().friday.at('02:30').do( 103 | train_model, 104 | config=('configs/timesformer/' 105 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb.py'), 106 | work_dir='mmaction2/work_dir/timesformer/') 107 | 108 | while True: 109 | schedule.run_pending() 110 | -------------------------------------------------------------------------------- /src/top_tags.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from rich.console import Console 5 | 6 | CONSOLE = Console() 7 | 8 | # top predictions to check for each clip 9 | N = 2 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='get the top tags of a video') 14 | parser.add_argument('predictions', help='json file containing predictions') 15 | parser.add_argument('--topk', 16 | type=int, 17 | default=3, 18 | choices=[1, 2, 3, 4, 5], 19 | help='top k tags to calculate') 20 | parser.add_argument('--label-map', 21 | default='resources/annotations/annotations.txt', 22 | help='annotation file') 23 | args = parser.parse_args() 24 | return args 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | with open(args.label_map, 'r') as ann: 30 | result = {line.strip(): 0 for line in ann} 31 | 32 | assert args.predictions.endswith( 33 | '.json'), 'prediction file is only supported in json format' 34 | with open(args.predictions, 'r') as f: 35 | predictions = json.load(f) 36 | 37 | for pred in 
predictions:
38 |         top_pred = list(pred.items())[:N]
39 |         for p in top_pred:
40 |             result[p[1]] += 1
41 | 
42 |     result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
43 |     CONSOLE.print(f'Top {args.topk} tags: {list(result.items())[:args.topk]}')
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     main()
48 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 | 
4 | 
5 | def annotations_list(annotations):
6 |     """Given an annotation file, return a list of them."""
7 |     with open(annotations) as ann:
8 |         result = [line.strip().replace('-', '_') for line in ann]
9 |     return result
10 | 
11 | 
12 | def annotations_dic(annotations):
13 |     """Given an annotation file, return a dictionary {label: index} of them."""
14 |     labels = annotations_list(annotations)
15 |     return {label: i for i, label in enumerate(labels)}
16 | 
17 | 
18 | def annotations_dict_rev(annotations):
19 |     """Given an annotation file return a dictionary {index: label} of them."""
20 |     result = annotations_dic(annotations)
21 |     return {v: k for k, v in result.items()}
22 | 
23 | 
24 | def gen_id(size=8):
25 |     """Generate a random id."""
26 |     chars = string.ascii_uppercase + string.digits
27 |     return ''.join(random.choice(chars) for _ in range(size))
28 | 
29 | 
30 | def prettify(byte_content):
31 |     """Prettify subprocess output.
32 | 
33 |     Args:
34 |         byte_content (bytes): output captured from a subprocess call
35 | 
36 |     Returns:
37 |         str: decoded text with escaped newlines and tabs expanded
38 |     """
39 |     decoded = byte_content.decode('utf-8')
40 |     formatted_output = decoded.replace('\\n', '\n').replace('\\t', '\t')
41 |     return formatted_output
42 | 
--------------------------------------------------------------------------------
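
A minimal usage sketch for the helpers in src/utils.py, assuming the default annotation file at resources/annotations/annotations.txt; the label names in the comments and the printed clip name are illustrative, not taken from the real label map:

import sys

sys.path.append('src/')  # same import pattern the demo scripts use
import utils

ann = 'resources/annotations/annotations.txt'
labels = utils.annotations_list(ann)           # e.g. ['label_a', 'label_b', ...]
label_to_id = utils.annotations_dic(ann)       # e.g. {'label_a': 0, 'label_b': 1}
id_to_label = utils.annotations_dict_rev(ann)  # e.g. {0: 'label_a', 1: 'label_b'}

# gen_id() returns an 8-character uppercase/digit id; the long-video demo above
# names its temporary subclips '<timestamp>_<id>.mp4' with it.
print(f'ts00_{utils.gen_id()}.mp4')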
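
And a sketch of the tallying performed by src/top_tags.py, assuming the predictions JSON is a list with one dict per clip whose values are label names ordered best-first (the keys are ignored, mirroring result[p[1]] += 1 above); the helper name tally_tags and the paths in the usage comment are illustrative only:

import json
from collections import Counter

N = 2  # top predictions considered per clip, as in src/top_tags.py


def tally_tags(predictions_path, label_map_path, topk=3):
    """Count how often each label appears among the top-N predictions per clip."""
    with open(label_map_path) as ann:
        counts = Counter({line.strip(): 0 for line in ann})
    with open(predictions_path) as f:
        predictions = json.load(f)
    for pred in predictions:
        # keep only the first N (best) entries of each per-clip dict
        for _, label in list(pred.items())[:N]:
            counts[label] += 1
    return counts.most_common(topk)


# e.g. tally_tags('demos/predictions.json', 'resources/annotations/annotations.txt')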