├── .gitignore ├── .gitmodules ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── annotator └── README.md ├── checkpoints ├── detector │ └── .gitkeep ├── har │ ├── audioonly_64x1x1.py │ ├── slowonly_u54_kinetics.py │ └── timesformer_divST_16x12x1_kinetics.py └── pose │ └── .gitkeep ├── configs ├── audio │ ├── audioonly_r101_64x1x1_200e_audio_feature.py │ ├── tsn_r18_64x1x1_100e_kinetics200_audio_feature.py │ └── tsn_r50_64x1x1_100e_kinetics400_audio.py ├── i3d │ └── i3d_r50_video_32x2x1_256e_kinetics400_rgb.py ├── omnisourced │ └── slowonly_r50_8x8x1_256e_omnisource_rgb.py ├── skeleton │ ├── agcn │ │ └── 2sagcn_640e_p300_keypoint_2d.py │ └── posec3d │ │ └── slowonly_r50_u54_640e_pr-kinetics.py ├── slowfast │ └── slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py ├── slowonly │ └── slowonly_nl_embedded_gaussian_r50_8x8x1_150e.py └── timesformer │ └── timesformer_divST_16x12x1_15e_kinetics400_rgb.py ├── docker └── Dockerfile ├── requirements ├── extra.txt └── requirements.txt ├── resources ├── ann_dist_clip.jpg ├── ann_dist_clips.json ├── annotation_distribution(min).json ├── annotation_distribution.jpg ├── annotations │ ├── annotations.txt │ ├── annotations_audio.txt │ ├── annotations_pose.txt │ ├── current_annotations.txt │ └── temp.txt ├── audio │ ├── db_20_config.yml │ └── db_30_config.yml └── metrics │ ├── audio_cm.png │ ├── audio_loss.jpg │ ├── posec3d_loss.jpg │ ├── skeleton_cm.png │ └── timesformer_loss.jpg └── src ├── __int__.py ├── analysis ├── __int__.py ├── audio_filter.py ├── class_distribution_clips.py ├── class_distribution_time.py ├── evaluate_acc_per_cls.py ├── pose_feasibility.py └── print_layers.py ├── data ├── README.md ├── __int__.py ├── augment_dataset.py ├── build_file_list.py ├── generate_dataset.py ├── generate_dataset_pose.py └── pose_extraction.py ├── demo ├── __int__.py ├── demo_audio.py ├── demo_skeleton.py ├── long_video_demo_clips.py ├── multimodial_demo.py └── visualize_heatmap_volume.py ├── late_fusion.py ├── misc.py ├── record_experiment.py ├── schedule_stuff.py ├── top_tags.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode 132 | .vscode/ 133 | 134 | # data 135 | dataset/ 136 | dataset_2/ 137 | mmaction2/data/ 138 | mmaction2/work_dir/ 139 | mlruns/ 140 | demos/ 141 | annotator/via-3.0.11/ 142 | annotator/via_video_annotator.html 143 | checkpoints/har/audio.pth 144 | checkpoints/har/posec3d.pth 145 | checkpoints/har/timeSformer.pth 146 | checkpoints/pose/hrnet_w32_coco_256x192.pth 147 | checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth 148 | temp/ 149 | tmp/ 150 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mmaction2"] 2 | path = mmaction2 3 | url = https://github.com/open-mmlab/mmaction2.git 4 | [submodule "mmdetection"] 5 | path = mmdetection 6 | url = https://github.com/open-mmlab/mmdetection.git 7 | [submodule "mmpose"] 8 | path = mmpose 9 | url = https://github.com/open-mmlab/mmpose.git 10 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party = cv2,data,decord,demo,har,mlflow,mmaction,mmcv,moviepy,numpy,pandas,pyloudnorm,rich,schedule,scipy,seaborn,soundfile,torch,tqdm,utils,vidaug,yaml 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^tests/data/ 2 | repos: 3 | - repo: https://gitlab.com/pycqa/flake8.git 4 | rev: 3.8.3 5 | hooks: 6 | - id: flake8 7 | - repo: https://github.com/asottile/seed-isort-config 8 | rev: v2.2.0 9 | hooks: 10 | - 
id: seed-isort-config 11 | - repo: https://github.com/timothycrosley/isort 12 | rev: 4.3.21 13 | hooks: 14 | - id: isort 15 | - repo: https://github.com/pre-commit/mirrors-yapf 16 | rev: v0.30.0 17 | hooks: 18 | - id: yapf 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.1.0 21 | hooks: 22 | - id: trailing-whitespace 23 | - id: check-yaml 24 | - id: end-of-file-fixer 25 | - id: requirements-txt-fixer 26 | - id: double-quote-string-fixer 27 | - id: check-merge-conflict 28 | - id: fix-encoding-pragma 29 | args: ["--remove"] 30 | - id: mixed-line-ending 31 | args: ["--fix=lf"] 32 | - repo: https://github.com/markdownlint/markdownlint 33 | rev: v0.11.0 34 | hooks: 35 | - id: markdownlint 36 | args: [ "-r", "~MD002,~MD013,~MD024,~MD029,~MD033,~MD034,~MD036" ] 37 | - repo: https://github.com/myint/docformatter 38 | rev: v1.3.1 39 | hooks: 40 | - id: docformatter 41 | args: ["--in-place", "--wrap-descriptions", "79"] 42 | - repo: https://github.com/codespell-project/codespell 43 | rev: v2.1.0 44 | hooks: 45 | - id: codespell 46 | args: ["--skip", "*.ipynb,tools/data/hvu/label_map.json", "-L", "te,nd,thre,Gool,gool"] 47 | # - repo: https://github.com/open-mmlab/pre-commit-hooks 48 | # rev: v0.1.0 # Use the ref you want to point at 49 | # hooks: 50 | # - id: check-algo-readme 51 | # - id: check-copyright 52 | # args: ["mmaction", "tools", "tests"] # these directories will be checked 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /annotator/README.md: -------------------------------------------------------------------------------- 1 | # VIA 2 | 3 | Simple but powerful annotator tool. Check it out, [link](https://www.robots.ox.ac.uk/~vgg/software/via/). 
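VIA 3 exports temporal video annotations as a single project JSON. As a rough illustration of how such an export can be flattened into `(video, label, start, end)` rows before building annotation files like those under `resources/annotations/`, here is a minimal sketch; the JSON keys (`file`, `metadata`, `z`, `av`, `fname`) and the `project.json` filename are assumptions about the VIA 3 export format, not this repo's actual conversion code (see `src/data/` for that).

```python
# Minimal sketch (assumption): flatten a VIA 3 video-project export into
# (video, label, start, end) rows. Key names and 'project.json' are illustrative.
import json


def load_via_segments(export_path='project.json'):
    with open(export_path) as f:
        project = json.load(f)

    # 'file' maps file ids to metadata such as the original filename (assumed schema)
    files = {fid: info['fname'] for fid, info in project.get('file', {}).items()}
    segments = []
    for entry in project.get('metadata', {}).values():
        # temporal segments carry a two-element 'z' = [start_sec, end_sec]
        if len(entry.get('z', [])) != 2:
            continue
        start, end = entry['z']
        # take the first attribute value as the class label
        label = next(iter(entry.get('av', {}).values()), None)
        # assumes view ids map one-to-one to file ids, as in simple projects
        video = files.get(entry.get('vid'), entry.get('vid'))
        segments.append((video, label, float(start), float(end)))
    return segments


if __name__ == '__main__':
    for video, label, start, end in load_via_segments():
        print(f'{video}\t{label}\t{start:.2f}\t{end:.2f}')
```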
4 | -------------------------------------------------------------------------------- /checkpoints/detector/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/checkpoints/detector/.gitkeep -------------------------------------------------------------------------------- /checkpoints/har/audioonly_64x1x1.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'AudioFeatureDataset' 2 | data_root = 'phar/mmaction2/data/phar/audio_feature/filtered_20/' 3 | data_root_val = 'mmaction2/data/phar/audio_feature/filtered_20/' 4 | data_root_test = 'mmaction2/data/phar/audio_feature/filtered_20/' 5 | ann_file_train = 'mmaction2/data/phar/audio_feature/filtered_20//train.txt' 6 | ann_file_val = 'mmaction2/data/phar/audio_feature/filtered_20//val.txt' 7 | ann_file_test = 'mmaction2/data/phar/audio_feature/filtered_20//val.txt' 8 | num_classes = 4 9 | model = dict(type='AudioRecognizer', 10 | backbone=dict(type='ResNetAudio', 11 | depth=101, 12 | pretrained=None, 13 | in_channels=1, 14 | norm_eval=False), 15 | cls_head=dict(type='AudioTSNHead', 16 | num_classes=4, 17 | in_channels=1024, 18 | dropout_ratio=0.5, 19 | init_std=0.01), 20 | train_cfg=None, 21 | test_cfg=dict(average_clips='prob')) 22 | train_pipeline = [ 23 | dict(type='LoadAudioFeature'), 24 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 25 | dict(type='AudioFeatureSelector'), 26 | dict(type='FormatAudioShape', input_format='NCTF'), 27 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 28 | dict(type='ToTensor', keys=['audios']) 29 | ] 30 | val_pipeline = [ 31 | dict(type='LoadAudioFeature'), 32 | dict(type='SampleFrames', 33 | clip_len=64, 34 | frame_interval=1, 35 | num_clips=1, 36 | test_mode=True), 37 | dict(type='AudioFeatureSelector'), 38 | dict(type='FormatAudioShape', input_format='NCTF'), 39 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 40 | dict(type='ToTensor', keys=['audios']) 41 | ] 42 | test_pipeline = [ 43 | dict(type='LoadAudioFeature'), 44 | dict(type='SampleFrames', 45 | clip_len=64, 46 | frame_interval=1, 47 | num_clips=10, 48 | test_mode=True), 49 | dict(type='AudioFeatureSelector'), 50 | dict(type='FormatAudioShape', input_format='NCTF'), 51 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 52 | dict(type='ToTensor', keys=['audios']) 53 | ] 54 | data = dict(videos_per_gpu=16, 55 | workers_per_gpu=1, 56 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 57 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 58 | train=dict(type='AudioFeatureDataset', 59 | ann_file='train.txt', 60 | data_prefix='', 61 | pipeline=[ 62 | dict(type='LoadAudioFeature'), 63 | dict(type='SampleFrames', 64 | clip_len=64, 65 | frame_interval=1, 66 | num_clips=1), 67 | dict(type='AudioFeatureSelector'), 68 | dict(type='FormatAudioShape', input_format='NCTF'), 69 | dict(type='Collect', 70 | keys=['audios', 'label'], 71 | meta_keys=[]), 72 | dict(type='ToTensor', keys=['audios']) 73 | ]), 74 | val=dict(type='AudioFeatureDataset', 75 | ann_file='val.txt', 76 | data_prefix='', 77 | pipeline=[ 78 | dict(type='LoadAudioFeature'), 79 | dict(type='SampleFrames', 80 | clip_len=64, 81 | frame_interval=1, 82 | num_clips=1, 83 | test_mode=True), 84 | dict(type='AudioFeatureSelector'), 85 | dict(type='FormatAudioShape', input_format='NCTF'), 86 | dict(type='Collect', 87 | keys=['audios', 'label'], 88 | 
meta_keys=[]), 89 | dict(type='ToTensor', keys=['audios']) 90 | ]), 91 | test=dict(type='AudioFeatureDataset', 92 | ann_file='val.txt', 93 | data_prefix='', 94 | pipeline=[ 95 | dict(type='LoadAudioFeature'), 96 | dict(type='SampleFrames', 97 | clip_len=64, 98 | frame_interval=1, 99 | num_clips=10, 100 | test_mode=True), 101 | dict(type='AudioFeatureSelector'), 102 | dict(type='FormatAudioShape', input_format='NCTF'), 103 | dict(type='Collect', 104 | keys=['audios', 'label'], 105 | meta_keys=[]), 106 | dict(type='ToTensor', keys=['audios']) 107 | ])) 108 | evaluation = dict(interval=5, 109 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 110 | 5)))) 111 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 112 | 5)))) 113 | optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001) 114 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 115 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 116 | total_epochs = 240 117 | checkpoint_config = dict(interval=20) 118 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 119 | dist_params = dict(backend='nccl') 120 | log_level = 'INFO' 121 | load_from = None 122 | resume_from = None 123 | workflow = [('train', 1)] 124 | opencv_num_threads = 0 125 | mp_start_method = 'fork' 126 | work_dir = 'mmaction2/work_dir/audio/' 127 | gpu_ids = range(0, 1) 128 | omnisource = False 129 | module_hooks = [] 130 | -------------------------------------------------------------------------------- /checkpoints/har/slowonly_u54_kinetics.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'PoseDataset' 2 | data_root = '/mmaction2/data/' 3 | data_root_val = '/mmaction2/data/' 4 | data_root_test = '/mmaction2/data/' 5 | ann_file_train = '/mmaction2/data/train.pkl' 6 | ann_file_val = '/mmaction2/data/val.pkl' 7 | ann_file_test = '/mmaction2/data/val.pkl' 8 | num_classes = 6 9 | left_kp = [1, 3, 5, 7, 9, 11, 13, 15] 10 | right_kp = [2, 4, 6, 8, 10, 12, 14, 16] 11 | model = dict(type='Recognizer3D', 12 | backbone=dict(type='ResNet3dSlowOnly', 13 | depth=50, 14 | pretrained=None, 15 | in_channels=17, 16 | base_channels=32, 17 | num_stages=3, 18 | out_indices=(2, ), 19 | stage_blocks=(4, 6, 3), 20 | conv1_stride_s=1, 21 | pool1_stride_s=1, 22 | inflate=(0, 1, 1), 23 | spatial_strides=(2, 2, 2), 24 | temporal_strides=(1, 1, 2), 25 | dilations=(1, 1, 1)), 26 | cls_head=dict(type='I3DHead', 27 | in_channels=512, 28 | num_classes=6, 29 | spatial_type='avg', 30 | dropout_ratio=0.7), 31 | train_cfg=dict(), 32 | test_cfg=dict(average_clips='prob')) 33 | train_pipeline = [ 34 | dict(type='UniformSampleFrames', clip_len=54), 35 | dict(type='PoseDecode'), 36 | dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 37 | dict(type='Resize', scale=(-1, 64)), 38 | dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), 39 | dict(type='Resize', scale=(56, 56), keep_ratio=False), 40 | dict(type='Flip', 41 | flip_ratio=0.5, 42 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 43 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 44 | dict(type='GeneratePoseTarget', 45 | sigma=0.6, 46 | use_score=True, 47 | with_kp=True, 48 | with_limb=False), 49 | dict(type='FormatShape', input_format='NCTHW'), 50 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 51 | dict(type='ToTensor', keys=['imgs', 'label']) 52 | ] 53 | val_pipeline = [ 54 | dict(type='UniformSampleFrames', clip_len=54, num_clips=1, test_mode=True), 55 | dict(type='PoseDecode'), 56 | 
dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 57 | dict(type='Resize', scale=(-1, 64)), 58 | dict(type='CenterCrop', crop_size=64), 59 | dict(type='GeneratePoseTarget', 60 | sigma=0.6, 61 | use_score=True, 62 | with_kp=True, 63 | with_limb=False), 64 | dict(type='FormatShape', input_format='NCTHW'), 65 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 66 | dict(type='ToTensor', keys=['imgs']) 67 | ] 68 | test_pipeline = [ 69 | dict(type='UniformSampleFrames', clip_len=54, num_clips=10, 70 | test_mode=True), 71 | dict(type='PoseDecode'), 72 | dict(type='PoseCompact', hw_ratio=1.0, allow_imgpad=True), 73 | dict(type='Resize', scale=(-1, 64)), 74 | dict(type='CenterCrop', crop_size=64), 75 | dict(type='GeneratePoseTarget', 76 | sigma=0.6, 77 | use_score=True, 78 | with_kp=True, 79 | with_limb=False, 80 | double=True, 81 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 82 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 83 | dict(type='FormatShape', input_format='NCTHW'), 84 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 85 | dict(type='ToTensor', keys=['imgs']) 86 | ] 87 | data = dict(videos_per_gpu=16, 88 | workers_per_gpu=2, 89 | test_dataloader=dict(videos_per_gpu=1), 90 | train=dict(type='PoseDataset', 91 | ann_file='/mmaction2/data//train.pkl', 92 | data_prefix='', 93 | pipeline=[ 94 | dict(type='UniformSampleFrames', clip_len=54), 95 | dict(type='PoseDecode'), 96 | dict(type='PoseCompact', 97 | hw_ratio=1.0, 98 | allow_imgpad=True), 99 | dict(type='Resize', scale=(-1, 64)), 100 | dict(type='RandomResizedCrop', 101 | area_range=(0.56, 1.0)), 102 | dict(type='Resize', 103 | scale=(56, 56), 104 | keep_ratio=False), 105 | dict(type='Flip', 106 | flip_ratio=0.5, 107 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 108 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 109 | dict(type='GeneratePoseTarget', 110 | sigma=0.6, 111 | use_score=True, 112 | with_kp=True, 113 | with_limb=False), 114 | dict(type='FormatShape', input_format='NCTHW'), 115 | dict(type='Collect', 116 | keys=['imgs', 'label'], 117 | meta_keys=[]), 118 | dict(type='ToTensor', keys=['imgs', 'label']) 119 | ]), 120 | val=dict(type='PoseDataset', 121 | ann_file='/mmaction2/data//val.pkl', 122 | data_prefix='', 123 | pipeline=[ 124 | dict(type='UniformSampleFrames', 125 | clip_len=54, 126 | num_clips=1, 127 | test_mode=True), 128 | dict(type='PoseDecode'), 129 | dict(type='PoseCompact', 130 | hw_ratio=1.0, 131 | allow_imgpad=True), 132 | dict(type='Resize', scale=(-1, 64)), 133 | dict(type='CenterCrop', crop_size=64), 134 | dict(type='GeneratePoseTarget', 135 | sigma=0.6, 136 | use_score=True, 137 | with_kp=True, 138 | with_limb=False), 139 | dict(type='FormatShape', input_format='NCTHW'), 140 | dict(type='Collect', 141 | keys=['imgs', 'label'], 142 | meta_keys=[]), 143 | dict(type='ToTensor', keys=['imgs']) 144 | ]), 145 | test=dict(type='PoseDataset', 146 | ann_file='/mmaction2/data//val.pkl', 147 | data_prefix='', 148 | pipeline=[ 149 | dict(type='UniformSampleFrames', 150 | clip_len=54, 151 | num_clips=10, 152 | test_mode=True), 153 | dict(type='PoseDecode'), 154 | dict(type='PoseCompact', 155 | hw_ratio=1.0, 156 | allow_imgpad=True), 157 | dict(type='Resize', scale=(-1, 64)), 158 | dict(type='CenterCrop', crop_size=64), 159 | dict(type='GeneratePoseTarget', 160 | sigma=0.6, 161 | use_score=True, 162 | with_kp=True, 163 | with_limb=False, 164 | double=True, 165 | left_kp=[1, 3, 5, 7, 9, 11, 13, 15], 166 | right_kp=[2, 4, 6, 8, 10, 12, 14, 16]), 167 | dict(type='FormatShape', input_format='NCTHW'), 168 | 
dict(type='Collect', 169 | keys=['imgs', 'label'], 170 | meta_keys=[]), 171 | dict(type='ToTensor', keys=['imgs']) 172 | ])) 173 | optimizer = dict(type='SGD', lr=0.05, momentum=0.9, weight_decay=0.0003) 174 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 175 | lr_config = dict(policy='CosineAnnealing', by_epoch=False, min_lr=0) 176 | total_epochs = 480 177 | checkpoint_config = dict(interval=20) 178 | workflow = [('train', 10)] 179 | evaluation = dict(interval=5, 180 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 181 | topk=(1, 2, 3, 4, 5)) 182 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 183 | 5)))) 184 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 185 | dist_params = dict(backend='nccl') 186 | log_level = 'INFO' 187 | load_from = ('https://download.openmmlab.com/mmaction/skeleton/posec3d/' 188 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_' 189 | 'keypoint/slowonly_kinetics400_pretrained_r50_u48_120e_ucf101' 190 | '_split1_keypoint-cae8aa4a.pth') 191 | resume_from = None 192 | find_unused_parameters = False 193 | work_dir = 'work_dir/posec3d/' 194 | gpu_ids = range(0, 1) 195 | omnisource = False 196 | module_hooks = [] 197 | -------------------------------------------------------------------------------- /checkpoints/har/timesformer_divST_16x12x1_kinetics.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = 'mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[127.5, 127.5, 127.5], 11 | std=[127.5, 127.5, 127.5], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='TimeSformer', 19 | pretrained= # noqa: E251 20 | 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 21 | num_frames=16, 22 | img_size=224, 23 | patch_size=16, 24 | embed_dims=768, 25 | in_channels=3, 26 | dropout_ratio=0.2, 27 | transformer_layers=None, 28 | # divided attention is the best strategy 29 | attention_type='divided_space_time', 30 | norm_cfg=dict(type='LN', eps=1e-6)), 31 | cls_head=dict(type='TimeSformerHead', 32 | num_classes=num_classes, 33 | in_channels=768, 34 | topk=(1, 2, 3, 4, 5)), 35 | # model training and testing settings 36 | train_cfg=None, 37 | test_cfg=dict(average_clips='prob')) 38 | 39 | train_pipeline = [ 40 | dict(type='DecordInit'), 41 | # * frame_interval has been selected for 7s clips 42 | dict(type='SampleFrames', clip_len=16, frame_interval=12, num_clips=1), 43 | dict(type='DecordDecode'), 44 | dict(type='RandomRescale', scale_range=(256, 320)), 45 | dict(type='RandomCrop', size=224), 46 | dict(type='Flip', flip_ratio=0.5), 47 | dict(type='Normalize', **img_norm_cfg), 48 | dict(type='FormatShape', input_format='NCTHW'), 49 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 50 | dict(type='ToTensor', keys=['imgs', 'label']) 51 | ] 52 | val_pipeline = [ 53 | dict(type='DecordInit'), 54 | dict(type='SampleFrames', 55 | clip_len=16, 56 | frame_interval=12, 57 | num_clips=1, 58 | test_mode=True), 59 | dict(type='DecordDecode'), 60 | dict(type='Resize', scale=(-1, 256)), 61 | dict(type='CenterCrop', crop_size=224), 62 | 
dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=16, 71 | frame_interval=12, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 224)), 76 | dict(type='ThreeCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs', 'label']) 81 | ] 82 | data = dict(videos_per_gpu=1, 83 | workers_per_gpu=1, 84 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 85 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | train=dict(type=dataset_type, 87 | ann_file=ann_file_train, 88 | data_prefix='', 89 | pipeline=train_pipeline), 90 | val=dict(type=dataset_type, 91 | ann_file=ann_file_val, 92 | data_prefix='', 93 | pipeline=val_pipeline), 94 | test=dict(type=dataset_type, 95 | ann_file=ann_file_test, 96 | data_prefix='', 97 | pipeline=test_pipeline)) 98 | 99 | # set the top-k accuracy during validation 100 | evaluation = dict( 101 | interval=1, 102 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 103 | ) 104 | # set the top-k accuracy during testing 105 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 106 | 5))), ) 107 | 108 | # optimizer 109 | optimizer = dict(type='SGD', 110 | lr=0.0015625, 111 | momentum=0.9, 112 | paramwise_cfg=dict( 113 | custom_keys={ 114 | '.backbone.cls_token': dict(decay_mult=0.0), 115 | '.backbone.pos_embed': dict(decay_mult=0.0), 116 | '.backbone.time_embed': dict(decay_mult=0.0) 117 | }), 118 | weight_decay=1e-4, 119 | nesterov=True) # this lr is used for 8 gpus 120 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 121 | 122 | # learning policy 123 | lr_config = dict(policy='step', step=[5, 10]) 124 | total_epochs = 25 125 | 126 | # * runtime settings 127 | checkpoint_config = dict(interval=1) 128 | log_config = dict( 129 | interval=1000, 130 | hooks=[ 131 | dict(type='TextLoggerHook'), 132 | # dict(type='TensorboardLoggerHook'), 133 | ]) 134 | dist_params = dict(backend='nccl') 135 | log_level = 'INFO' 136 | load_from = ('https://download.openmmlab.com/mmaction/recognition/timesformer/' 137 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb/' 138 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb-3f8e5d03.pth') 139 | resume_from = None 140 | workflow = [('train', 1)] 141 | 142 | # disable opencv multithreading to avoid system being overloaded 143 | opencv_num_threads = 0 144 | # set multi-process start method as `fork` to speed up the training 145 | mp_start_method = 'fork' 146 | -------------------------------------------------------------------------------- /checkpoints/pose/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/checkpoints/pose/.gitkeep -------------------------------------------------------------------------------- /configs/audio/audioonly_r101_64x1x1_200e_audio_feature.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'AudioFeatureDataset' 3 | data_root = 
('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | 'audio_feature/filtered_30/') 5 | data_root_val = data_root 6 | data_root_test = data_root 7 | ann_file_train = f'{data_root}/train.txt' 8 | ann_file_val = f'{data_root_val}/val.txt' 9 | ann_file_test = f'{data_root_test}/val.txt' 10 | num_classes = 4 11 | 12 | # * model settings 13 | model = dict( 14 | type='AudioRecognizer', 15 | backbone=dict(type='ResNetAudio', 16 | depth=101, 17 | pretrained=None, 18 | in_channels=1, 19 | norm_eval=False), 20 | cls_head=dict( 21 | type='AudioTSNHead', 22 | num_classes=num_classes, 23 | in_channels=1024, 24 | dropout_ratio=0.5, # TODO: 0.6 - 0.8 25 | init_std=0.01, 26 | topk=(1, 2, 3, 4, 5)), 27 | # model training and testing settings 28 | train_cfg=None, 29 | test_cfg=dict(average_clips='prob')) 30 | 31 | train_pipeline = [ 32 | dict(type='LoadAudioFeature'), 33 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 34 | dict(type='AudioFeatureSelector'), 35 | dict(type='FormatAudioShape', input_format='NCTF'), 36 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 37 | dict(type='ToTensor', keys=['audios']) 38 | ] 39 | val_pipeline = [ 40 | dict(type='LoadAudioFeature'), 41 | dict(type='SampleFrames', 42 | clip_len=64, 43 | frame_interval=1, 44 | num_clips=1, 45 | test_mode=True), 46 | dict(type='AudioFeatureSelector'), 47 | dict(type='FormatAudioShape', input_format='NCTF'), 48 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 49 | dict(type='ToTensor', keys=['audios']) 50 | ] 51 | test_pipeline = [ 52 | dict(type='LoadAudioFeature'), 53 | dict(type='SampleFrames', 54 | clip_len=64, 55 | frame_interval=1, 56 | num_clips=10, 57 | test_mode=True), 58 | dict(type='AudioFeatureSelector'), 59 | dict(type='FormatAudioShape', input_format='NCTF'), 60 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 61 | dict(type='ToTensor', keys=['audios']) 62 | ] 63 | data = dict(videos_per_gpu=16, 64 | workers_per_gpu=1, 65 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 66 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 67 | train=dict(type=dataset_type, 68 | ann_file=ann_file_train, 69 | data_prefix='', 70 | pipeline=train_pipeline), 71 | val=dict(type=dataset_type, 72 | ann_file=ann_file_val, 73 | data_prefix='', 74 | pipeline=val_pipeline), 75 | test=dict(type=dataset_type, 76 | ann_file=ann_file_test, 77 | data_prefix='', 78 | pipeline=test_pipeline)) 79 | # set the top-k accuracy during validation 80 | evaluation = dict( 81 | interval=5, # Interval to perform evaluation 82 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 83 | ) 84 | # set the top-k accuracy during testing 85 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 86 | 5))), ) 87 | 88 | # optimizer 89 | optimizer = dict(type='SGD', lr=0.0025, momentum=0.9, 90 | weight_decay=0.0001) # this lr is used for 8 gpus 91 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 92 | # learning policy 93 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 94 | total_epochs = 320 95 | 96 | # * runtime settings 97 | checkpoint_config = dict(interval=20) 98 | log_config = dict( 99 | interval=100, 100 | hooks=[ 101 | dict(type='TextLoggerHook'), 102 | # dict(type='TensorboardLoggerHook'), 103 | ]) 104 | # runtime settings 105 | dist_params = dict(backend='nccl') 106 | log_level = 'INFO' 107 | load_from = None 108 | resume_from = None 109 | workflow = [('train', 1)] 110 | 111 | # disable opencv multithreading 
to avoid system being overloaded 112 | opencv_num_threads = 0 113 | # set multi-process start method as `fork` to speed up the training 114 | mp_start_method = 'fork' 115 | -------------------------------------------------------------------------------- /configs/audio/tsn_r18_64x1x1_100e_kinetics200_audio_feature.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'AudioFeatureDataset' 3 | data_root = ('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | 'audio_feature/filtered_20/') 5 | data_root_val = data_root 6 | data_root_test = data_root 7 | ann_file_train = f'{data_root}/train.txt' 8 | ann_file_val = f'{data_root_val}/val.txt' 9 | ann_file_test = f'{data_root_test}/test.txt' 10 | num_classes = 3 11 | 12 | # * model settings 13 | model = dict( 14 | type='AudioRecognizer', 15 | backbone=dict(type='ResNet', depth=18, in_channels=1, norm_eval=False), 16 | cls_head=dict(type='AudioTSNHead', 17 | num_classes=num_classes, 18 | in_channels=512, 19 | dropout_ratio=0.7, 20 | init_std=0.01, 21 | topk=(1, 2, 3, 4, 5)), 22 | # model training and testing settings 23 | train_cfg=None, 24 | test_cfg=dict(average_clips='prob')) 25 | 26 | train_pipeline = [ 27 | dict(type='LoadAudioFeature'), 28 | dict(type='SampleFrames', clip_len=64, frame_interval=2, num_clips=1), 29 | dict(type='AudioFeatureSelector'), 30 | dict(type='FormatAudioShape', input_format='NCTF'), 31 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 32 | dict(type='ToTensor', keys=['audios']) 33 | ] 34 | val_pipeline = [ 35 | dict(type='LoadAudioFeature'), 36 | dict(type='SampleFrames', 37 | clip_len=64, 38 | frame_interval=2, 39 | num_clips=1, 40 | test_mode=True), 41 | dict(type='AudioFeatureSelector'), 42 | dict(type='FormatAudioShape', input_format='NCTF'), 43 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 44 | dict(type='ToTensor', keys=['audios']) 45 | ] 46 | test_pipeline = [ 47 | dict(type='LoadAudioFeature'), 48 | dict(type='SampleFrames', 49 | clip_len=64, 50 | frame_interval=2, 51 | num_clips=1, 52 | test_mode=True), 53 | dict(type='AudioFeatureSelector'), 54 | dict(type='FormatAudioShape', input_format='NCTF'), 55 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 56 | dict(type='ToTensor', keys=['audios']) 57 | ] 58 | data = dict(videos_per_gpu=32, 59 | workers_per_gpu=2, 60 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 61 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 62 | train=dict(type=dataset_type, 63 | ann_file=ann_file_train, 64 | data_prefix='', 65 | pipeline=train_pipeline), 66 | val=dict(type=dataset_type, 67 | ann_file=ann_file_val, 68 | data_prefix='', 69 | pipeline=val_pipeline), 70 | test=dict(type=dataset_type, 71 | ann_file=ann_file_test, 72 | data_prefix='', 73 | pipeline=test_pipeline)) 74 | # set the top-k accuracy during validation 75 | evaluation = dict( 76 | interval=5, # Interval to perform evaluation 77 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 78 | ) 79 | # set the top-k accuracy during testing 80 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 81 | 5))), ) 82 | 83 | # optimizer 84 | optimizer = dict(type='SGD', lr=0.1, momentum=0.9, 85 | weight_decay=0.0001) # this lr is used for 8 gpus 86 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 87 | # learning policy 88 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 89 | total_epochs = 200 90 | 91 | # * runtime 
settings 92 | checkpoint_config = dict(interval=20) 93 | log_config = dict( 94 | interval=20, 95 | hooks=[ 96 | dict(type='TextLoggerHook'), 97 | # dict(type='TensorboardLoggerHook'), 98 | ]) 99 | # runtime settings 100 | dist_params = dict(backend='nccl') 101 | log_level = 'INFO' 102 | load_from = ( 103 | 'https://download.openmmlab.com/mmaction/recognition/' 104 | 'audio_recognition/tsn_r18_64x1x1_100e_kinetics400_audio_feature/' 105 | 'tsn_r18_64x1x1_100e_kinetics400_audio_feature_20201012-bf34df6c.pth') 106 | # load_from=None 107 | resume_from = None 108 | workflow = [('train', 1)] 109 | 110 | # disable opencv multithreading to avoid system being overloaded 111 | opencv_num_threads = 0 112 | # set multi-process start method as `fork` to speed up the training 113 | mp_start_method = 'fork' 114 | -------------------------------------------------------------------------------- /configs/audio/tsn_r50_64x1x1_100e_kinetics400_audio.py: -------------------------------------------------------------------------------- 1 | 2 | # * dataset settings 3 | dataset_type = 'AudioFeatureDataset' 4 | data_root = ('/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 5 | 'audio/') 6 | data_root_val = data_root 7 | data_root_test = data_root 8 | ann_file_train = f'{data_root}/train.txt' 9 | ann_file_val = f'{data_root_val}/val.txt' 10 | ann_file_test = f'{data_root_test}/test.txt' 11 | num_classes = 4 12 | 13 | # * model settings 14 | model = dict( 15 | type='AudioRecognizer', 16 | backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False), 17 | cls_head=dict( 18 | type='AudioTSNHead', 19 | num_classes=num_classes, 20 | in_channels=2048, 21 | dropout_ratio=0.5, 22 | init_std=0.01), 23 | # model training and testing settings 24 | train_cfg=None, 25 | test_cfg=dict(average_clips='prob')) 26 | 27 | 28 | train_pipeline = [ 29 | dict(type='AudioDecodeInit'), 30 | dict(type='SampleFrames', clip_len=64, frame_interval=1, num_clips=1), 31 | dict(type='AudioDecode'), 32 | dict(type='AudioAmplify', ratio=1.5), 33 | # dict(type='MelLogSpectrogram'), 34 | dict(type='FormatAudioShape', input_format='NCTF'), 35 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 36 | dict(type='ToTensor', keys=['audios']) 37 | ] 38 | val_pipeline = [ 39 | dict(type='AudioDecodeInit'), 40 | dict( 41 | type='SampleFrames', 42 | clip_len=64, 43 | frame_interval=1, 44 | num_clips=1, 45 | test_mode=True), 46 | dict(type='AudioDecode'), 47 | dict(type='AudioAmplify', ratio=1.5), 48 | # dict(type='MelLogSpectrogram'), 49 | dict(type='FormatAudioShape', input_format='NCTF'), 50 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 51 | dict(type='ToTensor', keys=['audios']) 52 | ] 53 | test_pipeline = [ 54 | dict(type='AudioDecodeInit'), 55 | dict( 56 | type='SampleFrames', 57 | clip_len=64, 58 | frame_interval=1, 59 | num_clips=1, 60 | test_mode=True), 61 | dict(type='AudioDecodeInit'), 62 | dict(type='AudioAmplify', ratio=1.5), 63 | # dict(type='MelLogSpectrogram'), 64 | dict(type='FormatAudioShape', input_format='NCTF'), 65 | dict(type='Collect', keys=['audios', 'label'], meta_keys=[]), 66 | dict(type='ToTensor', keys=['audios']) 67 | ] 68 | data = dict( 69 | videos_per_gpu=32, 70 | workers_per_gpu=1, 71 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 72 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 73 | train=dict( 74 | type=dataset_type, 75 | ann_file=ann_file_train, 76 | data_prefix='', 77 | pipeline=train_pipeline), 78 | val=dict( 79 | type=dataset_type, 80 | 
ann_file=ann_file_val, 81 | data_prefix='', 82 | pipeline=val_pipeline), 83 | test=dict( 84 | type=dataset_type, 85 | ann_file=ann_file_test, 86 | data_prefix='', 87 | pipeline=test_pipeline)) 88 | # set the top-k accuracy during validation 89 | evaluation = dict( 90 | interval=5, # Interval to perform evaluation 91 | metric_options=dict( 92 | top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))),) 93 | # set the top-k accuracy during testing 94 | eval_config = dict( 95 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))),) 96 | 97 | # optimizer 98 | optimizer = dict( 99 | type='SGD', lr=0.05, momentum=0.9, 100 | weight_decay=0.0001) # this lr is used for 8 gpus 101 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 102 | # learning policy 103 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 104 | total_epochs = 540 105 | 106 | # * runtime settings 107 | checkpoint_config = dict(interval=10) 108 | log_config = dict( 109 | interval=20, 110 | hooks=[ 111 | dict(type='TextLoggerHook'), 112 | # dict(type='TensorboardLoggerHook'), 113 | ]) 114 | # runtime settings 115 | dist_params = dict(backend='nccl') 116 | log_level = 'INFO' 117 | load_from = None 118 | resume_from = None 119 | workflow = [('train', 1)] 120 | 121 | # disable opencv multithreading to avoid system being overloaded 122 | opencv_num_threads = 0 123 | # set multi-process start method as `fork` to speed up the training 124 | mp_start_method = 'fork' 125 | -------------------------------------------------------------------------------- /configs/i3d/i3d_r50_video_32x2x1_256e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | 11 | # * model settings 12 | model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3d', 15 | pretrained2d=True, 16 | pretrained='torchvision://resnet50', 17 | depth=50, 18 | conv1_kernel=(5, 7, 7), 19 | conv1_stride_t=2, 20 | pool1_stride_t=2, 21 | conv_cfg=dict(type='Conv3d'), 22 | norm_eval=False, 23 | inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 24 | 0)), 25 | zero_init_residual=False), 26 | cls_head=dict(type='I3DHead', 27 | num_classes=num_classes, 28 | in_channels=2048, 29 | spatial_type='avg', 30 | dropout_ratio=0.8, 31 | init_std=0.01, 32 | topk=(1, 2, 3, 4, 5)), 33 | # model training and testing settings 34 | train_cfg=None, 35 | test_cfg=dict(average_clips='prob')) 36 | 37 | # This setting refers to https://github.com/open-mmlab/mmaction/blob/master/mmaction/models/tenons/backbones/resnet_i3d.py#L329-L332 # noqa: E501 38 | 39 | # * dataset settings 40 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 41 | std=[58.395, 57.12, 57.375], 42 | to_bgr=False) 43 | train_pipeline = [ 44 | dict(type='DecordInit'), 45 | dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), 46 | dict(type='DecordDecode'), 47 | dict(type='Resize', scale=(-1, 256)), 48 | dict(type='MultiScaleCrop', 49 | input_size=224, 50 | scales=(1, 0.8), 51 | random_crop=False, 52 | max_wh_scale_gap=0), 53 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 54 | dict(type='Flip', flip_ratio=0.5), 55 | dict(type='Normalize', **img_norm_cfg), 56 | 
dict(type='FormatShape', input_format='NCTHW'), 57 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 58 | dict(type='ToTensor', keys=['imgs', 'label']) 59 | ] 60 | val_pipeline = [ 61 | dict(type='DecordInit'), 62 | dict(type='SampleFrames', 63 | clip_len=32, 64 | frame_interval=2, 65 | num_clips=1, 66 | test_mode=True), 67 | dict(type='DecordDecode'), 68 | dict(type='Resize', scale=(-1, 256)), 69 | dict(type='CenterCrop', crop_size=224), 70 | dict(type='Normalize', **img_norm_cfg), 71 | dict(type='FormatShape', input_format='NCTHW'), 72 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 73 | dict(type='ToTensor', keys=['imgs']) 74 | ] 75 | test_pipeline = [ 76 | dict(type='DecordInit'), 77 | dict(type='SampleFrames', 78 | clip_len=32, 79 | frame_interval=2, 80 | num_clips=8, 81 | test_mode=True), 82 | dict(type='DecordDecode'), 83 | dict(type='Resize', scale=(-1, 256)), 84 | dict(type='ThreeCrop', crop_size=256), 85 | dict(type='Normalize', **img_norm_cfg), 86 | dict(type='FormatShape', input_format='NCTHW'), 87 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 88 | dict(type='ToTensor', keys=['imgs']) 89 | ] 90 | 91 | data = dict(videos_per_gpu=4, 92 | workers_per_gpu=1, 93 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 94 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 95 | train=dict( 96 | type=dataset_type, 97 | ann_file=ann_file_train, 98 | data_prefix='', 99 | pipeline=train_pipeline, 100 | ), 101 | val=dict( 102 | type=dataset_type, 103 | ann_file=ann_file_val, 104 | data_prefix='', 105 | pipeline=val_pipeline, 106 | ), 107 | test=dict( 108 | type=dataset_type, 109 | ann_file=ann_file_test, 110 | data_prefix='', 111 | pipeline=test_pipeline, 112 | )) 113 | # set the top-k accuracy during validation 114 | evaluation = dict( 115 | interval=5, # Interval to perform evaluation 116 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 117 | ) 118 | # set the top-k accuracy during testing 119 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 120 | 5))), ) 121 | 122 | # * optimizer 123 | optimizer = dict(type='SGD', lr=0.00625, momentum=0.9, weight_decay=0.0001) 124 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 125 | # learning policy 126 | lr_config = dict(policy='step', step=[40, 80]) 127 | total_epochs = 256 128 | 129 | # * runtime settings 130 | checkpoint_config = dict(interval=2) 131 | log_config = dict( 132 | interval=200, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook'), 136 | ]) 137 | dist_params = dict(backend='nccl') 138 | log_level = 'INFO' 139 | load_from = ('https://download.openmmlab.com/mmaction/' 140 | 'recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb/' 141 | 'i3d_r50_video_32x2x1_100e_kinetics400_rgb_20200826-e31c6f52.pth') 142 | resume_from = None 143 | workflow = [('train', 1)] 144 | -------------------------------------------------------------------------------- /configs/omnisourced/slowonly_r50_8x8x1_256e_omnisource_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/test.txt' 9 | num_classes = 18 10 | 11 | # * model settings 12 | 
model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3dSlowOnly', 15 | depth=50, 16 | pretrained=None, 17 | lateral=False, 18 | conv1_kernel=(1, 7, 7), 19 | conv1_stride_t=1, 20 | pool1_stride_t=1, 21 | inflate=(0, 0, 1, 1), 22 | norm_eval=False), 23 | cls_head=dict(type='I3DHead', 24 | in_channels=2048, 25 | num_classes=num_classes, 26 | spatial_type='avg', 27 | dropout_ratio=0.5, 28 | topk=(1, 2, 3, 4, 5)), 29 | # model training and testing settings 30 | train_cfg=None, 31 | test_cfg=dict(average_clips='prob')) 32 | 33 | # * dataset settings 34 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 35 | std=[58.395, 57.12, 57.375], 36 | to_bgr=False) 37 | train_pipeline = [ 38 | dict(type='DecordInit'), 39 | dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), 40 | dict(type='DecordDecode'), 41 | dict(type='Resize', scale=(-1, 256)), 42 | dict(type='RandomResizedCrop'), 43 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 44 | dict(type='Flip', flip_ratio=0.5), 45 | dict(type='Normalize', **img_norm_cfg), 46 | dict(type='FormatShape', input_format='NCTHW'), 47 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 48 | dict(type='ToTensor', keys=['imgs', 'label']) 49 | ] 50 | 51 | val_pipeline = [ 52 | dict(type='DecordInit'), 53 | dict(type='SampleFrames', 54 | clip_len=8, 55 | frame_interval=8, 56 | num_clips=1, 57 | test_mode=True), 58 | dict(type='DecordDecode'), 59 | dict(type='Resize', scale=(-1, 256)), 60 | dict(type='CenterCrop', crop_size=256), 61 | dict(type='Normalize', **img_norm_cfg), 62 | dict(type='FormatShape', input_format='NCTHW'), 63 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 64 | dict(type='ToTensor', keys=['imgs']) 65 | ] 66 | 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=8, 71 | frame_interval=8, 72 | num_clips=10, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 256)), 76 | dict(type='ThreeCrop', crop_size=256), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs']) 81 | ] 82 | 83 | data = dict(videos_per_gpu=6, 84 | workers_per_gpu=1, 85 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 87 | train=dict( 88 | type=dataset_type, 89 | ann_file=ann_file_train, 90 | data_prefix='', 91 | pipeline=train_pipeline, 92 | ), 93 | val=dict( 94 | type=dataset_type, 95 | ann_file=ann_file_val, 96 | data_prefix='', 97 | pipeline=val_pipeline, 98 | ), 99 | test=dict( 100 | type=dataset_type, 101 | ann_file=ann_file_test, 102 | data_prefix='', 103 | pipeline=test_pipeline, 104 | )) 105 | # set the top-k accuracy during validation 106 | evaluation = dict( 107 | interval=5, # Interval to perform evaluation 108 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 109 | ) 110 | # set the top-k accuracy during testing 111 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 112 | 5))), ) 113 | 114 | # * optimizer 115 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, 116 | weight_decay=0.0001) # this lr is used for 8 gpus 117 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 118 | # learning policy 119 | lr_config = dict(policy='CosineAnnealing', min_lr=0) 120 | 121 | # The flag indicates using joint training 122 | # omnisource = True 123 | 124 | # 
* runtime settings 125 | checkpoint_config = dict(interval=4) 126 | log_config = dict( 127 | interval=20, 128 | hooks=[ 129 | dict(type='TextLoggerHook'), 130 | # dict(type='TensorboardLoggerHook'), 131 | ]) 132 | # runtime settings 133 | total_epochs = 256 134 | find_unused_parameters = False 135 | dist_params = dict(backend='nccl') 136 | log_level = 'INFO' 137 | load_from = ( 138 | 'https://download.openmmlab.com/mmaction/recognition/slowonly' 139 | '/omni/slowonly_r50_omni_4x16x1_kinetics400_rgb_20200926-51b1f7ea.pth') 140 | resume_from = None 141 | workflow = [('train', 1)] 142 | 143 | # disable opencv multithreading to avoid system being overloaded 144 | opencv_num_threads = 0 145 | # set multi-process start method as `fork` to speed up the training 146 | mp_start_method = 'fork' 147 | -------------------------------------------------------------------------------- /configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py: -------------------------------------------------------------------------------- 1 | model = dict(type='SkeletonGCN', 2 | backbone=dict(type='AGCN', 3 | in_channels=3, 4 | graph_cfg=dict(layout='coco', strategy='agcn'), 5 | dropout=0.2), 6 | cls_head=dict(type='STGCNHead', 7 | num_classes=6, 8 | in_channels=256, 9 | loss_cls=dict(type='CrossEntropyLoss'), 10 | topk=(1, 2, 3, 4, 5)), 11 | train_cfg=None, 12 | test_cfg=None) 13 | 14 | dataset_type = 'PoseDataset' 15 | ann_file_train = '/mmaction2/data/kinesphere_train.pkl' 16 | ann_file_val = '/mmaction2/data/kinesphere_val.pkl' 17 | ann_file_test = '/mmaction2/data/kinesphere_val.pkl' 18 | 19 | train_pipeline = [ 20 | dict(type='PaddingWithLoop', clip_len=450), 21 | dict(type='PoseDecode'), 22 | dict(type='FormatGCNInput', input_format='NCTVM'), 23 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 24 | dict(type='ToTensor', keys=['keypoint']) 25 | ] 26 | val_pipeline = [ 27 | dict(type='PaddingWithLoop', clip_len=450), 28 | dict(type='PoseDecode'), 29 | dict(type='FormatGCNInput', input_format='NCTVM'), 30 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 31 | dict(type='ToTensor', keys=['keypoint']) 32 | ] 33 | test_pipeline = [ 34 | dict(type='PaddingWithLoop', clip_len=450), 35 | dict(type='PoseDecode'), 36 | dict(type='FormatGCNInput', input_format='NCTVM'), 37 | dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), 38 | dict(type='ToTensor', keys=['keypoint']) 39 | ] 40 | data = dict(videos_per_gpu=32, 41 | workers_per_gpu=2, 42 | test_dataloader=dict(videos_per_gpu=1), 43 | train=dict(type=dataset_type, 44 | ann_file=ann_file_train, 45 | data_prefix='', 46 | pipeline=train_pipeline), 47 | val=dict(type=dataset_type, 48 | ann_file=ann_file_val, 49 | data_prefix='', 50 | pipeline=val_pipeline), 51 | test=dict(type=dataset_type, 52 | ann_file=ann_file_val, 53 | data_prefix='', 54 | pipeline=test_pipeline)) 55 | 56 | # optimizer 57 | optimizer = dict(type='SGD', 58 | lr=0.1, 59 | momentum=0.9, 60 | weight_decay=0.0001, 61 | nesterov=True) 62 | optimizer_config = dict(grad_clip=None) 63 | # learning policy 64 | lr_config = dict(policy='step', step=[30, 40, 520]) 65 | total_epochs = 640 66 | checkpoint_config = dict(interval=40) 67 | evaluation = dict(interval=5, 68 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 69 | topk=(1, 2, 3, 4, 5)) 70 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 71 | 5))), ) 72 | log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')]) 73 | 74 | # runtime settings 75 | dist_params = dict(backend='nccl') 
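# Usage (a minimal sketch, assuming the standard mmaction2 0.x `tools/` entry
# points from the pinned submodule; the checkpoint path `work_dirs/agcn/latest.pth`
# is only a placeholder, not something fixed by this repository):
#   python mmaction2/tools/train.py \
#       configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py --validate
#   python mmaction2/tools/test.py \
#       configs/skeleton/agcn/2sagcn_640e_p300_keypoint_2d.py \
#       work_dirs/agcn/latest.pth --eval top_k_accuracy mean_class_accuracy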
76 | log_level = 'INFO' 77 | load_from = None 78 | resume_from = None 79 | workflow = [('train', 1)] 80 | -------------------------------------------------------------------------------- /configs/skeleton/posec3d/slowonly_r50_u54_640e_pr-kinetics.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'PoseDataset' 3 | data_root = ('/home/jovyan/mmaction2/data') 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/kinesphere_train.pkl' 7 | ann_file_val = f'{data_root_val}/kinesphere_val.pkl' 8 | ann_file_test = f'{data_root_test}/kinesphere_val.pkl' 9 | num_classes = 6 10 | left_kp = [1, 3, 5, 7, 9, 11, 13, 15] 11 | right_kp = [2, 4, 6, 8, 10, 12, 14, 16] 12 | 13 | # model settings 14 | model = dict(type='Recognizer3D', 15 | backbone=dict(type='ResNet3dSlowOnly', 16 | depth=50, 17 | pretrained=None, 18 | in_channels=17, 19 | base_channels=32, 20 | num_stages=3, 21 | out_indices=(2, ), 22 | stage_blocks=(4, 6, 3), 23 | conv1_stride_s=1, 24 | pool1_stride_s=1, 25 | inflate=(0, 1, 1), 26 | spatial_strides=(2, 2, 2), 27 | temporal_strides=(1, 1, 2), 28 | dilations=(1, 1, 1)), 29 | cls_head=dict(type='I3DHead', 30 | in_channels=512, 31 | num_classes=num_classes, 32 | spatial_type='avg', 33 | dropout_ratio=0.7, 34 | topk=(1, 2, 3, 4, 5)), 35 | train_cfg=dict(), 36 | test_cfg=dict(average_clips='prob')) 37 | 38 | train_pipeline = [ 39 | # * 54 (25% of 210) sampled frames seems better 40 | # 48 frames = 22.8% 41 | dict(type='UniformSampleFrames', clip_len=54), 42 | dict(type='PoseDecode'), 43 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 44 | dict(type='Resize', scale=(-1, 64)), 45 | dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), 46 | dict(type='Resize', scale=(56, 56), keep_ratio=False), 47 | dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), 48 | dict(type='GeneratePoseTarget', 49 | sigma=0.6, 50 | use_score=True, 51 | with_kp=True, 52 | with_limb=False), 53 | dict(type='FormatShape', input_format='NCTHW'), 54 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 55 | dict(type='ToTensor', keys=['imgs', 'label']) 56 | ] 57 | val_pipeline = [ 58 | dict(type='UniformSampleFrames', clip_len=54, num_clips=1, test_mode=True), 59 | dict(type='PoseDecode'), 60 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 61 | dict(type='Resize', scale=(-1, 64)), 62 | dict(type='CenterCrop', crop_size=64), 63 | dict(type='GeneratePoseTarget', 64 | sigma=0.6, 65 | use_score=True, 66 | with_kp=True, 67 | with_limb=False), 68 | dict(type='FormatShape', input_format='NCTHW'), 69 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 70 | dict(type='ToTensor', keys=['imgs']) 71 | ] 72 | test_pipeline = [ 73 | dict(type='UniformSampleFrames', clip_len=54, num_clips=10, 74 | test_mode=True), 75 | dict(type='PoseDecode'), 76 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 77 | dict(type='Resize', scale=(-1, 64)), 78 | dict(type='CenterCrop', crop_size=64), 79 | dict(type='GeneratePoseTarget', 80 | sigma=0.6, 81 | use_score=True, 82 | with_kp=True, 83 | with_limb=False, 84 | double=True, 85 | left_kp=left_kp, 86 | right_kp=right_kp), 87 | dict(type='FormatShape', input_format='NCTHW'), 88 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 89 | dict(type='ToTensor', keys=['imgs']) 90 | ] 91 | data = dict(videos_per_gpu=12, 92 | workers_per_gpu=2, 93 | test_dataloader=dict(videos_per_gpu=1), 94 | 
train=dict(type=dataset_type, 95 | ann_file=ann_file_train, 96 | data_prefix='', 97 | pipeline=train_pipeline), 98 | val=dict(type=dataset_type, 99 | ann_file=ann_file_val, 100 | data_prefix='', 101 | pipeline=val_pipeline), 102 | test=dict(type=dataset_type, 103 | ann_file=ann_file_test, 104 | data_prefix='', 105 | pipeline=test_pipeline)) 106 | 107 | # optimizer 108 | optimizer = dict(type='SGD', lr=0.0375, momentum=0.9, 109 | weight_decay=0.0003) # this lr is used for 8 gpus 110 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 111 | 112 | # learning policy 113 | lr_config = dict(policy='CosineAnnealing', by_epoch=False, min_lr=0) 114 | total_epochs = 640 115 | checkpoint_config = dict(interval=40) 116 | workflow = [('train', 10)] 117 | evaluation = dict(interval=5, 118 | metrics=['top_k_accuracy', 'mean_class_accuracy'], 119 | topk=(1, 2, 3, 4, 5)) 120 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 121 | 5))), ) 122 | log_config = dict(interval=20, hooks=[ 123 | dict(type='TextLoggerHook'), 124 | ]) 125 | 126 | dist_params = dict(backend='nccl') 127 | log_level = 'INFO' 128 | load_from = ( 129 | 'https://download.openmmlab.com/mmaction/skeleton/posec3d/' 130 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_keypoint/' 131 | 'slowonly_kinetics400_pretrained_r50_u48_120e_ucf101_split1_keypoint' 132 | '-cae8aa4a.pth') 133 | resume_from = None 134 | find_unused_parameters = False 135 | -------------------------------------------------------------------------------- /configs/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/test.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 11 | std=[58.395, 57.12, 57.375], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='ResNet3dSlowFast', 19 | pretrained=None, 20 | resample_rate=8, # tau 21 | speed_ratio=8, # alpha 22 | channel_ratio=8, # beta_inv 23 | slow_pathway=dict(type='resnet3d', 24 | depth=50, 25 | pretrained=None, 26 | lateral=True, 27 | conv1_kernel=(1, 7, 7), 28 | dilations=(1, 1, 1, 1), 29 | conv1_stride_t=1, 30 | pool1_stride_t=1, 31 | inflate=(0, 0, 1, 1), 32 | norm_eval=False), 33 | fast_pathway=dict(type='resnet3d', 34 | depth=50, 35 | pretrained=None, 36 | lateral=False, 37 | base_channels=8, 38 | conv1_kernel=(5, 7, 7), 39 | conv1_stride_t=1, 40 | pool1_stride_t=1, 41 | norm_eval=False)), 42 | cls_head=dict( 43 | type='SlowFastHead', 44 | in_channels=2304, # 2048+256 45 | num_classes=num_classes, 46 | spatial_type='avg', 47 | dropout_ratio=0.7, 48 | topk=(1, 2, 3, 4, 5)), 49 | # model training and testing settings 50 | train_cfg=None, 51 | test_cfg=dict(average_clips='prob')) 52 | 53 | # * pipelines 54 | train_pipeline = [ 55 | dict(type='DecordInit'), 56 | dict(type='SampleFrames', clip_len=24, frame_interval=2, num_clips=1), 57 | dict(type='DecordDecode'), 58 | dict(type='Resize', scale=(-1, 256)), 59 | dict(type='RandomResizedCrop'), 60 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 61 | dict(type='Flip', flip_ratio=0.5), 62 | 
dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | val_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=24, 71 | frame_interval=2, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 256)), 76 | dict(type='CenterCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs']) 81 | ] 82 | test_pipeline = [ 83 | dict(type='DecordInit'), 84 | dict(type='SampleFrames', 85 | clip_len=24, 86 | frame_interval=2, 87 | num_clips=10, 88 | test_mode=True), 89 | dict(type='DecordDecode'), 90 | dict(type='Resize', scale=(-1, 256)), 91 | dict(type='ThreeCrop', crop_size=256), 92 | dict(type='Normalize', **img_norm_cfg), 93 | dict(type='FormatShape', input_format='NCTHW'), 94 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 95 | dict(type='ToTensor', keys=['imgs']) 96 | ] 97 | data = dict(videos_per_gpu=8, 98 | workers_per_gpu=1, 99 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 100 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 101 | train=dict(type=dataset_type, 102 | ann_file=ann_file_train, 103 | data_prefix='', 104 | pipeline=train_pipeline), 105 | val=dict(type=dataset_type, 106 | ann_file=ann_file_val, 107 | data_prefix='', 108 | pipeline=val_pipeline), 109 | test=dict(type=dataset_type, 110 | ann_file=ann_file_test, 111 | data_prefix='', 112 | pipeline=test_pipeline)) 113 | 114 | # set the top-k accuracy during validation 115 | evaluation = dict( 116 | interval=5, # Interval to perform evaluation 117 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 118 | ) 119 | # set the top-k accuracy during testing 120 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 121 | 5))), ) 122 | # * optimizer 123 | optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001) 124 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 125 | # learning policy 126 | lr_config = dict(policy='CosineAnnealing', 127 | min_lr=0, 128 | warmup='linear', 129 | warmup_by_epoch=True, 130 | warmup_iters=34) 131 | total_epochs = 256 132 | 133 | # * runtime settings 134 | checkpoint_config = dict(interval=1) 135 | log_config = dict( 136 | interval=200, 137 | hooks=[ 138 | dict(type='TextLoggerHook'), 139 | # dict(type='TensorboardLoggerHook'), 140 | ]) 141 | dist_params = dict(backend='nccl') 142 | log_level = 'INFO' 143 | find_unused_parameters = False 144 | load_from = ( 145 | 'https://download.openmmlab.com/mmaction/recognition/' 146 | 'slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb/' 147 | 'slowfast_r50_video_4x16x1_256e_kinetics400_rgb_20200826-f85b90c5.pth') 148 | resume_from = None 149 | workflow = [('train', 1)] 150 | # disable opencv multithreading to avoid system being overloaded 151 | opencv_num_threads = 0 152 | # set multi-process start method as `fork` to speed up the training 153 | mp_start_method = 'fork' 154 | -------------------------------------------------------------------------------- /configs/slowonly/slowonly_nl_embedded_gaussian_r50_8x8x1_150e.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 
'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | 11 | # * model settings 12 | model = dict( 13 | type='Recognizer3D', 14 | backbone=dict(type='ResNet3dSlowOnly', 15 | depth=50, 16 | pretrained='torchvision://resnet50', 17 | lateral=False, 18 | conv1_kernel=(1, 7, 7), 19 | conv1_stride_t=1, 20 | pool1_stride_t=1, 21 | inflate=(0, 0, 1, 1), 22 | norm_eval=False, 23 | non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), 24 | (0, 0, 0)), 25 | non_local_cfg=dict(sub_sample=True, 26 | use_scale=True, 27 | norm_cfg=dict(type='BN3d', 28 | requires_grad=True), 29 | mode='embedded_gaussian')), 30 | cls_head=dict(type='I3DHead', 31 | in_channels=2048, 32 | num_classes=num_classes, 33 | spatial_type='avg', 34 | dropout_ratio=0.7, 35 | topk=(1, 2, 3, 4, 5)), 36 | # model training and testing settings 37 | train_cfg=None, 38 | test_cfg=dict(average_clips='prob')) 39 | 40 | img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 41 | std=[58.395, 57.12, 57.375], 42 | to_bgr=False) 43 | train_pipeline = [ 44 | dict(type='DecordInit'), 45 | dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), 46 | dict(type='DecordDecode'), 47 | dict(type='Resize', scale=(-1, 256)), 48 | dict(type='RandomResizedCrop'), 49 | dict(type='Resize', scale=(224, 224), keep_ratio=False), 50 | dict(type='Flip', flip_ratio=0.5), 51 | dict(type='Normalize', **img_norm_cfg), 52 | dict(type='FormatShape', input_format='NCTHW'), 53 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 54 | dict(type='ToTensor', keys=['imgs', 'label']) 55 | ] 56 | val_pipeline = [ 57 | dict(type='DecordInit'), 58 | dict(type='SampleFrames', 59 | clip_len=8, 60 | frame_interval=8, 61 | num_clips=1, 62 | test_mode=True), 63 | dict(type='DecordDecode'), 64 | dict(type='Resize', scale=(-1, 256)), 65 | dict(type='CenterCrop', crop_size=224), 66 | dict(type='Normalize', **img_norm_cfg), 67 | dict(type='FormatShape', input_format='NCTHW'), 68 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 69 | dict(type='ToTensor', keys=['imgs']) 70 | ] 71 | test_pipeline = [ 72 | dict(type='DecordInit'), 73 | dict(type='SampleFrames', 74 | clip_len=8, 75 | frame_interval=8, 76 | num_clips=10, 77 | test_mode=True), 78 | dict(type='DecordDecode'), 79 | dict(type='Resize', scale=(-1, 256)), 80 | dict(type='ThreeCrop', crop_size=256), 81 | dict(type='Normalize', **img_norm_cfg), 82 | dict(type='FormatShape', input_format='NCTHW'), 83 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 84 | dict(type='ToTensor', keys=['imgs']) 85 | ] 86 | data = dict(videos_per_gpu=4, 87 | workers_per_gpu=1, 88 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 89 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 90 | train=dict(type=dataset_type, 91 | ann_file=ann_file_train, 92 | data_prefix='', 93 | pipeline=train_pipeline), 94 | val=dict(type=dataset_type, 95 | ann_file=ann_file_val, 96 | data_prefix='', 97 | pipeline=val_pipeline), 98 | test=dict(type=dataset_type, 99 | ann_file=ann_file_test, 100 | data_prefix='', 101 | pipeline=test_pipeline)) 102 | # set the top-k accuracy during validation 103 | evaluation = dict( 104 | interval=5, # Interval to perform evaluation 105 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 106 | ) 107 | # 
set the top-k accuracy during testing 108 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 109 | 5))), ) 110 | 111 | # * optimizer 112 | optimizer = dict(type='SGD', lr=0.00625, momentum=0.9, weight_decay=0.0001) 113 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 114 | # learning policy 115 | lr_config = dict(policy='step', 116 | step=[90, 130], 117 | warmup='linear', 118 | warmup_by_epoch=True, 119 | warmup_iters=10) 120 | total_epochs = 150 121 | 122 | # * runtime settings 123 | checkpoint_config = dict(interval=1) 124 | log_config = dict( 125 | interval=200, 126 | hooks=[ 127 | dict(type='TextLoggerHook'), 128 | # dict(type='TensorboardLoggerHook'), 129 | ]) 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' 133 | 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' 134 | 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' 135 | '20210308-e8dd9e82.pth') 136 | resume_from = None 137 | workflow = [('train', 1)] 138 | 139 | # disable opencv multithreading to avoid system being overloaded 140 | opencv_num_threads = 0 141 | # set multi-process start method as `fork` to speed up the training 142 | mp_start_method = 'fork' 143 | -------------------------------------------------------------------------------- /configs/timesformer/timesformer_divST_16x12x1_15e_kinetics400_rgb.py: -------------------------------------------------------------------------------- 1 | # * dataset settings 2 | dataset_type = 'VideoDataset' 3 | data_root = '/home/rejnald/projects/side_projects/phar/mmaction2/data/phar/' 4 | data_root_val = data_root 5 | data_root_test = data_root 6 | ann_file_train = f'{data_root}/train_aug.txt' 7 | ann_file_val = f'{data_root_val}/val.txt' 8 | ann_file_test = f'{data_root_test}/val.txt' 9 | num_classes = 17 10 | img_norm_cfg = dict(mean=[127.5, 127.5, 127.5], 11 | std=[127.5, 127.5, 127.5], 12 | to_bgr=False) 13 | 14 | # * model settings 15 | model = dict( 16 | type='Recognizer3D', 17 | backbone=dict( 18 | type='TimeSformer', 19 | pretrained= # noqa: E251 20 | 'https://download.openmmlab.com/mmaction/recognition/timesformer/vit_base_patch16_224.pth', # noqa: E501 21 | num_frames=16, 22 | img_size=224, 23 | patch_size=16, 24 | embed_dims=768, 25 | in_channels=3, 26 | dropout_ratio=0.2, 27 | transformer_layers=None, 28 | # divided attention is the best strategy 29 | attention_type='divided_space_time', 30 | norm_cfg=dict(type='LN', eps=1e-6)), 31 | cls_head=dict(type='TimeSformerHead', 32 | num_classes=num_classes, 33 | in_channels=768, 34 | topk=(1, 2, 3, 4, 5)), 35 | # model training and testing settings 36 | train_cfg=None, 37 | test_cfg=dict(average_clips='prob')) 38 | 39 | train_pipeline = [ 40 | dict(type='DecordInit'), 41 | # * frame_interval has been selected for 7s clips 42 | dict(type='SampleFrames', clip_len=16, frame_interval=12, num_clips=1), 43 | dict(type='DecordDecode'), 44 | dict(type='RandomRescale', scale_range=(256, 320)), 45 | dict(type='RandomCrop', size=224), 46 | dict(type='Flip', flip_ratio=0.5), 47 | dict(type='Normalize', **img_norm_cfg), 48 | dict(type='FormatShape', input_format='NCTHW'), 49 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 50 | dict(type='ToTensor', keys=['imgs', 'label']) 51 | ] 52 | val_pipeline = [ 53 | dict(type='DecordInit'), 54 | dict(type='SampleFrames', 55 | clip_len=16, 56 | frame_interval=12, 57 | num_clips=1, 58 | test_mode=True), 59 | 
dict(type='DecordDecode'), 60 | dict(type='Resize', scale=(-1, 256)), 61 | dict(type='CenterCrop', crop_size=224), 62 | dict(type='Normalize', **img_norm_cfg), 63 | dict(type='FormatShape', input_format='NCTHW'), 64 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 65 | dict(type='ToTensor', keys=['imgs', 'label']) 66 | ] 67 | test_pipeline = [ 68 | dict(type='DecordInit'), 69 | dict(type='SampleFrames', 70 | clip_len=16, 71 | frame_interval=12, 72 | num_clips=1, 73 | test_mode=True), 74 | dict(type='DecordDecode'), 75 | dict(type='Resize', scale=(-1, 224)), 76 | dict(type='ThreeCrop', crop_size=224), 77 | dict(type='Normalize', **img_norm_cfg), 78 | dict(type='FormatShape', input_format='NCTHW'), 79 | dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), 80 | dict(type='ToTensor', keys=['imgs', 'label']) 81 | ] 82 | data = dict(videos_per_gpu=1, 83 | workers_per_gpu=1, 84 | test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 85 | val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1), 86 | train=dict(type=dataset_type, 87 | ann_file=ann_file_train, 88 | data_prefix='', 89 | pipeline=train_pipeline), 90 | val=dict(type=dataset_type, 91 | ann_file=ann_file_val, 92 | data_prefix='', 93 | pipeline=val_pipeline), 94 | test=dict(type=dataset_type, 95 | ann_file=ann_file_test, 96 | data_prefix='', 97 | pipeline=test_pipeline)) 98 | 99 | # set the top-k accuracy during validation 100 | evaluation = dict( 101 | interval=1, # Interval to perform evaluation 102 | metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 5))), 103 | ) 104 | # set the top-k accuracy during testing 105 | eval_config = dict(metric_options=dict(top_k_accuracy=dict(topk=(1, 2, 3, 4, 106 | 5))), ) 107 | 108 | # optimizer 109 | optimizer = dict(type='SGD', 110 | lr=0.0015625, 111 | momentum=0.9, 112 | paramwise_cfg=dict( 113 | custom_keys={ 114 | '.backbone.cls_token': dict(decay_mult=0.0), 115 | '.backbone.pos_embed': dict(decay_mult=0.0), 116 | '.backbone.time_embed': dict(decay_mult=0.0) 117 | }), 118 | weight_decay=1e-4, 119 | nesterov=True) # this lr is used for 8 gpus 120 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 121 | 122 | # learning policy 123 | lr_config = dict(policy='step', step=[5, 10]) 124 | total_epochs = 25 125 | 126 | # * runtime settings 127 | checkpoint_config = dict(interval=1) 128 | log_config = dict( 129 | interval=1000, 130 | hooks=[ 131 | dict(type='TextLoggerHook'), 132 | # dict(type='TensorboardLoggerHook'), 133 | ]) 134 | # runtime settings 135 | dist_params = dict(backend='nccl') 136 | log_level = 'INFO' 137 | load_from = ('https://download.openmmlab.com/mmaction/recognition/timesformer/' 138 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb/' 139 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb-3f8e5d03.pth') 140 | resume_from = None 141 | workflow = [('train', 1)] 142 | 143 | # disable opencv multithreading to avoid system being overloaded 144 | opencv_num_threads = 0 145 | # set multi-process start method as `fork` to speed up the training 146 | mp_start_method = 'fork' 147 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTORCH="1.8.0" 2 | ARG CUDA="11.1" 3 | ARG CUDNN="8" 4 | FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel 5 | 6 | ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0+PTX" 7 | ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" 8 | ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" 9 
| 10 | # ! https://github.com/NVIDIA/nvidia-docker/issues/1632 11 | # currently image not working properly 12 | RUN apt-key del 7fa2af80 13 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub 14 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub 15 | 16 | RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 \ 17 | libxrender-dev libxext6 ffmpeg nano p7zip-full imagemagick wget unzip \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | RUN pip install mmcv-full==1.3.18 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html 22 | RUN git clone --recurse-submodules https://github.com/rlleshi/phar.git phar 23 | 24 | # install mmaction, mmpose, mmdet 25 | WORKDIR /workspace/phar/mmaction2 26 | ENV FORCE_CUDA="1" 27 | RUN pip install cython --no-cache-dir 28 | RUN pip install --no-cache-dir -e . 29 | WORKDIR /workspace/phar/mmpose 30 | RUN pip install -r requirements.txt 31 | RUN pip install -v -e . 32 | RUN pip install mmdet==2.12.0 33 | 34 | # install extra dependencies 35 | WORKDIR /workspace/phar 36 | RUN pip install -r requirements/extra.txt 37 | 38 | # download models 39 | RUN wget https://github.com/rlleshi/phar/releases/download/v1.0.0/audio.pth -O checkpoints/har/audio.pth \ 40 | && wget https://github.com/rlleshi/phar/releases/download/v1.0.0/posec3d.pth -O checkpoints/har/posec3d.pth \ 41 | && wget https://github.com/rlleshi/phar/releases/download/v1.0.0/timeSformer.pth -O checkpoints/har/timeSformer.pth \ 42 | && wget https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth -O checkpoints/pose/hrnet_w32_coco_256x192.pth \ 43 | && wget http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ 44 | -O checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth 45 | -------------------------------------------------------------------------------- /requirements/extra.txt: -------------------------------------------------------------------------------- 1 | librosa==0.8.1 2 | lws==1.2.7 3 | mlflow 4 | moviepy==1.0.3 5 | numpy==1.22.4 6 | pyloudnorm==0.1.0 7 | rich 8 | schedule 9 | SoundFile==0.10.3.post1 10 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | addict==2.4.0 2 | albumentations==1.1.0 3 | alembic==1.4.1 4 | appdirs==1.4.4 5 | attrs==21.4.0 6 | audioread==2.1.9 7 | autopep8==1.6.0 8 | beautifulsoup4==4.9.3 9 | certifi==2020.12.5 10 | cffi==1.15.0 11 | cfgv==3.3.1 12 | chardet==4.0.0 13 | chumpy==0.70 14 | click==7.1.2 15 | cloudpickle==1.6.0 16 | colorama==0.4.4 17 | commonmark==0.9.1 18 | coverage==6.3.1 19 | cycler==0.10.0 20 | Cython==0.29.23 21 | databricks-cli==0.14.3 22 | decorator==4.4.2 23 | decord==0.4.1 24 | defusedxml==0.7.1 25 | distlib==0.3.4 26 | docker==5.0.0 27 | einops==0.3.0 28 | entrypoints==0.3 29 | ez-setup==0.9 30 | ffmpeg-python==0.2.0 31 | filelock==3.4.2 32 | flake8==4.0.1 33 | flake8-import-order==0.18.1 34 | Flask==2.0.0 35 | flatbuffers==2.0 36 | freetype-py==2.2.0 37 | future==0.18.2 38 | gitdb==4.0.7 39 | GitPython==3.1.17 40 | greenlet==1.1.0 41 | gunicorn==20.1.0 42 | identify==2.4.4 43 | idna==2.10 44 | imageio==2.9.0 45 | imageio-ffmpeg==0.4.3 46 | 
importlib-metadata==4.8.2 47 | iniconfig==1.1.1 48 | interrogate==1.5.0 49 | isort==4.3.21 50 | itsdangerous==2.0.0 51 | Jinja2==3.0.0 52 | joblib==1.1.0 53 | json-tricks==3.15.5 54 | kiwisolver==1.3.1 55 | librosa==0.8.1 56 | llvmlite==0.38.0 57 | lws==1.2.7 58 | Mako==1.1.4 59 | Markdown==3.3.6 60 | MarkupSafe==2.0.0 61 | matplotlib==3.4.2 62 | mccabe==0.6.1 63 | mlflow==1.17.0 64 | -e git+ssh://git@github.com/open-mmlab/mmaction2.git@255bbc08634c21e6400af7b9d1a470b52285ebcd#egg=mmaction2 65 | mmcv-full==1.3.18 66 | mmdet==2.12.0 67 | -e git+https://github.com/open-mmlab/mmpose.git@5c8ba2657b26ee9487451c45ba794823fa607cfd#egg=mmpose 68 | model-index==0.1.11 69 | motmetrics==1.2.0 70 | moviepy==1.0.3 71 | munkres==1.1.4 72 | networkx==2.6.3 73 | nodeenv==1.6.0 74 | numba==0.55.0 75 | numpy==1.20.2 76 | odfpy==1.4.1 77 | onnx==1.10.2 78 | onnxruntime==1.10.0 79 | opencv-contrib-python==4.5.4.60 80 | opencv-python==4.5.4.60 81 | opencv-python-headless==4.5.5.62 82 | openmim==0.1.5 83 | ordered-set==4.0.2 84 | packaging==21.3 85 | pandas==1.1.5 86 | Pillow==8.2.0 87 | platformdirs==2.4.1 88 | pluggy==1.0.0 89 | pooch==1.6.0 90 | poseval==0.1.0 91 | pre-commit==2.17.0 92 | proglog==0.1.9 93 | prometheus-client==0.10.1 94 | prometheus-flask-exporter==0.18.2 95 | protobuf==3.17.0 96 | py==1.11.0 97 | py-cpuinfo==8.0.0 98 | pycocotools==2.0.2 99 | pycodestyle==2.8.0 100 | pycparser==2.21 101 | pyflakes==2.4.0 102 | pyglet==1.5.21 103 | Pygments==2.10.0 104 | pyloudnorm==0.1.0 105 | PyOpenGL==3.1.0 106 | pyparsing==2.4.7 107 | pyrender==0.1.45 108 | pytest==7.0.0 109 | pytest-benchmark==3.4.1 110 | pytest-runner==5.3.1 111 | python-dateutil==2.8.1 112 | python-editor==1.0.4 113 | pytz==2021.1 114 | PyWavelets==1.2.0 115 | PyYAML==5.4.1 116 | qudida==0.0.4 117 | querystring-parser==1.2.4 118 | requests==2.25.1 119 | resampy==0.2.2 120 | rich==10.9.0 121 | scenedetect==0.5.6.1 122 | schedule==1.1.0 123 | scikit-image==0.19.1 124 | scikit-learn==1.0.2 125 | scipy==1.6.3 126 | seaborn==0.11.1 127 | Shapely==1.8.0 128 | six==1.16.0 129 | smmap==4.0.0 130 | smplx==0.1.28 131 | SoundFile==0.10.3.post1 132 | soupsieve==2.2.1 133 | SQLAlchemy==1.4.15 134 | sqlparse==0.4.1 135 | tabulate==0.8.9 136 | terminaltables==3.1.0 137 | threadpoolctl==3.0.0 138 | tifffile==2022.2.2 139 | toml==0.10.2 140 | tomli==2.0.1 141 | torch==1.10.0+cu113 142 | torchaudio==0.10.0+cu113 143 | torchvision==0.11.1+cu113 144 | tqdm==4.60.0 145 | trimesh==3.10.0 146 | typing-extensions==3.10.0.0 147 | urllib3==1.26.4 148 | vidaug==0.1 149 | virtualenv==20.13.0 150 | webcolors==1.11.1 151 | websocket-client==0.59.0 152 | Werkzeug==2.0.0 153 | xdoctest==0.15.10 154 | xmltodict==0.12.0 155 | xtcocotools==1.10 156 | yapf==0.31.0 157 | zipp==3.4.1 158 | -------------------------------------------------------------------------------- /resources/ann_dist_clip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/ann_dist_clip.jpg -------------------------------------------------------------------------------- /resources/ann_dist_clips.json: -------------------------------------------------------------------------------- 1 | { 2 | "kissing": 598, 3 | "fondling": 409, 4 | "handjob": 1403, 5 | "fingering": 952, 6 | "titjob": 1174, 7 | "blowjob": 1883, 8 | "cunnilingus": 1733, 9 | "deepthroat": 2057, 10 | "doggy": 1883, 11 | "the_snake": 1595, 12 | "anal": 1560, 13 | "missionary": 1882, 14 | "cowgirl": 1663, 15 | 
"scoop_up": 1336, 16 | "cumshot": 570, 17 | "facial_cumshot": 781, 18 | "69": 1132, 19 | "total": 22611, 20 | "average": 1330 21 | } 22 | -------------------------------------------------------------------------------- /resources/annotation_distribution(min).json: -------------------------------------------------------------------------------- 1 | { 2 | "kissing": 72.7, 3 | "fondling": 48.1, 4 | "handjob": 164.9, 5 | "fingering": 112.7, 6 | "titjob": 137.2, 7 | "blowjob": 222.6, 8 | "cunnilingus": 203.5, 9 | "deepthroat": 250.2, 10 | "doggy": 221.2, 11 | "the_snake": 186.4, 12 | "anal": 184.4, 13 | "missionary": 226.9, 14 | "cowgirl": 193.7, 15 | "scoop_up": 155.3, 16 | "cumshot": 67.7, 17 | "facial_cumshot": 94.4, 18 | "69": 132.7, 19 | "total": 2674.6000000000004 20 | } 21 | -------------------------------------------------------------------------------- /resources/annotation_distribution.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/annotation_distribution.jpg -------------------------------------------------------------------------------- /resources/annotations/annotations.txt: -------------------------------------------------------------------------------- 1 | kissing 2 | fondling 3 | handjob 4 | fingering 5 | titjob 6 | blowjob 7 | cunnilingus 8 | deepthroat 9 | doggy 10 | the-snake 11 | anal 12 | missionary 13 | cowgirl 14 | scoop-up 15 | cumshot 16 | facial-cumshot 17 | 69 18 | -------------------------------------------------------------------------------- /resources/annotations/annotations_audio.txt: -------------------------------------------------------------------------------- 1 | anal 2 | deepthroat 3 | doggy 4 | blowjob 5 | -------------------------------------------------------------------------------- /resources/annotations/annotations_pose.txt: -------------------------------------------------------------------------------- 1 | blowjob 2 | doggy 3 | missionary 4 | cowgirl 5 | 69 6 | kissing 7 | -------------------------------------------------------------------------------- /resources/annotations/current_annotations.txt: -------------------------------------------------------------------------------- 1 | blowjob 2 | ass-eating 3 | deep-throat 4 | cunnilingus 5 | tit-fuck 6 | handjob 7 | cumshot 8 | anal 9 | fingering 10 | kissing 11 | tit-sucking 12 | squirting 13 | gaping 14 | other 15 | doggystyle 16 | cowgirl 17 | 69 18 | missionary 19 | reverse-cowgirl 20 | spooning 21 | -------------------------------------------------------------------------------- /resources/annotations/temp.txt: -------------------------------------------------------------------------------- 1 | kissing 2 | -------------------------------------------------------------------------------- /resources/audio/db_20_config.yml: -------------------------------------------------------------------------------- 1 | anal: -46.23 2 | deepthroat: -45.69 3 | doggy: -48.04 4 | blowjob: -53.72 5 | cumshot: -45.03 6 | cunnilingus: -48.73 7 | miscellaneous: -49.02 8 | kissing: -51.1 9 | -------------------------------------------------------------------------------- /resources/audio/db_30_config.yml: -------------------------------------------------------------------------------- 1 | anal: -40.81 2 | deepthroat: -41.32 3 | doggy: -42.63 4 | blowjob: -45.28 5 | -------------------------------------------------------------------------------- /resources/metrics/audio_cm.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/audio_cm.png -------------------------------------------------------------------------------- /resources/metrics/audio_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/audio_loss.jpg -------------------------------------------------------------------------------- /resources/metrics/posec3d_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/posec3d_loss.jpg -------------------------------------------------------------------------------- /resources/metrics/skeleton_cm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/skeleton_cm.png -------------------------------------------------------------------------------- /resources/metrics/timesformer_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/resources/metrics/timesformer_loss.jpg -------------------------------------------------------------------------------- /src/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/__int__.py -------------------------------------------------------------------------------- /src/analysis/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/analysis/__int__.py -------------------------------------------------------------------------------- /src/analysis/audio_filter.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import shutil 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pyloudnorm as pyln 8 | import soundfile as sf 9 | from rich.console import Console 10 | from tqdm import tqdm 11 | 12 | CONSOLE = Console() 13 | 14 | EXTS = ['.wav'] 15 | 16 | 17 | def parse_args(): 18 | parser = ArgumentParser(prog='filter audio based on loudness. 
' 19 | 'Removes a certain percentile') 20 | parser.add_argument('src_dir', help='src directory') 21 | parser.add_argument('out_dir', help='out directory') 22 | parser.add_argument('--percentile', 23 | type=int, 24 | default=20, 25 | help='thresholding percentile for loudness in db') 26 | parser.add_argument('--level', 27 | type=int, 28 | default=1, 29 | help='directory level of data') 30 | args = parser.parse_args() 31 | return args 32 | 33 | 34 | def main(): 35 | args = parse_args() 36 | Path(args.out_dir).mkdir(parents=True, exist_ok=True) 37 | CONSOLE.print( 38 | f'Thresholding all audios found in {args.src_dir} with the ' 39 | f'{args.percentile}-th percentile', 40 | style='green') 41 | 42 | audios = glob.glob(args.src_dir + '/*' * args.level) 43 | audios = [ 44 | audio for audio in audios if any(audio.endswith(ext) for ext in EXTS) 45 | ] 46 | 47 | # assuming that all audios have same rate 48 | _, rate = sf.read(audios[0]) 49 | meter = pyln.Meter(rate) # meter works with decibels 50 | loudness = [] 51 | 52 | for audio in tqdm(audios): 53 | data, _ = sf.read(audio) 54 | loudness.append((audio, meter.integrated_loudness(data))) 55 | 56 | min_db = np.percentile([loud[1] for loud in loudness], args.percentile) 57 | CONSOLE.print(f'{args.percentile}-th percentile is {min_db}', 58 | style='green') 59 | 60 | filtered_audios = list(filter(lambda x: x[1] > min_db, loudness)) 61 | for audio in filtered_audios: 62 | shutil.copy(audio[0], args.out_dir) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /src/analysis/class_distribution_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import os.path as osp 5 | import sys 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | from rich.console import Console 10 | 11 | sys.path.append('./tools') # noqa 12 | import utils as utils # noqa isort:skip 13 | 14 | CONSOLE = Console() 15 | PLOT_SPLIT_THR = 26 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description='calculates number of clips / annotation classes') 21 | parser.add_argument('--src-dir', 22 | default='mmaction2/data/phar', 23 | help='the dir that contains all the videos') 24 | parser.add_argument('--splits', 25 | nargs='+', 26 | default=['train', 'val', 'test'], 27 | choices=['train', 'val', 'test'], 28 | help='the splits where clips are found') 29 | parser.add_argument('--ann', 30 | type=str, 31 | default='resources/annotations/annotations.txt', 32 | help='annotation file') 33 | parser.add_argument('--out-dir', 34 | default='resources/', 35 | help='directory to store output files') 36 | args = parser.parse_args() 37 | return args 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | result = utils.annotations_dic(args.ann) 43 | result = {k: 0 for k, _ in result.items()} 44 | 45 | for split in args.splits: 46 | path_to_label = osp.join(args.src_dir, split) 47 | for label in os.listdir(path_to_label): 48 | result[label] += len(os.listdir(osp.join(path_to_label, label))) 49 | 50 | labels = list(result.keys()) 51 | values = list(result.values()) 52 | result['total'] = sum(values) 53 | result['average'] = round(result['total'] / len(values)) 54 | 55 | # save json 56 | result_json = json.dumps(result, indent=4) 57 | f = open(osp.join(args.out_dir, 'ann_dist_clips.json'), 'w') 58 | print(result_json, file=f) 59 | f.close() 60 | 61 | # save plot 62 | dfs = [] 63 | if len(labels) >= 
PLOT_SPLIT_THR: 64 | # have to split in at least 2 groups for readability 65 | dfs.append( 66 | pd.DataFrame({ 67 | 'Class': labels[:int(len(labels) / 2)], 68 | 'Value': values[:int(len(values) / 2)] 69 | })) 70 | dfs.append( 71 | pd.DataFrame({ 72 | 'Class': labels[int(len(labels) / 2):], 73 | 'Value': values[int(len(values) / 2):] 74 | })) 75 | else: 76 | dfs.append(pd.DataFrame({'Class': labels, 'Value': values})) 77 | 78 | for df in dfs: 79 | sns.set(rc={'figure.figsize': (15, 13)}) 80 | fig = sns.barplot(x='Class', y='Value', data=df) 81 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) 82 | fig.axes.set_title('Sample Distribution / Class ', fontsize=40) 83 | fig.set_xlabel('Class', fontsize=30) 84 | fig.set_ylabel('Value', fontsize=20) 85 | output = fig.get_figure() 86 | output.savefig( 87 | osp.join(args.out_dir, f'ann_dist_clips_{utils.gen_id(2)}.jpg')) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /src/analysis/class_distribution_time.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os.path as osp 4 | import sys 5 | from argparse import ArgumentParser 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | from rich.console import Console 11 | 12 | sys.path.append('./src') # noqa: E501 13 | import utils as utils # noqa isort:skip 14 | 15 | CONSOLE = Console() 16 | ANN_EXT = '.csv' 17 | 18 | 19 | def get_actions_with_timestamps(path): 20 | """Given the path to a csv file, get its timestamps. 21 | 22 | The function is specific to the temporal csv annotations produced by the 23 | VIA annotator. 24 | """ 25 | results = [] 26 | try: 27 | df = pd.read_csv(path) 28 | except Exception: 29 | CONSOLE.print(f'Error reading {path}', style='error') 30 | for i in range(1, len(df)): 31 | temp = str(df.iloc[i].value_counts()).split(' ') 32 | results.append({ 33 | 'action': 34 | temp[0].split(':"')[1].strip('}"'), 35 | 'video': 36 | ''.join(list(filter(lambda x: x not in '["],', temp[6]))), 37 | 'start': 38 | float(temp[7][:-1]), 39 | 'end': 40 | float(temp[8][:-2]) 41 | }) 42 | return results 43 | 44 | 45 | def parse_args(): 46 | parser = ArgumentParser(prog='time analysis of annotation distribution ' 47 | 'based on the CSV files annotations.') 48 | parser.add_argument('--csv-dir', 49 | default='dataset/', 50 | help='directory of csv annotations') 51 | parser.add_argument('--out_dir', default='resources/') 52 | parser.add_argument('--ann', 53 | type=str, 54 | default='resources/annotations/annotations.txt', 55 | help='annotation file') 56 | parser.add_argument('--level', 57 | type=int, 58 | default=1, 59 | choices=[1, 2], 60 | help='directory level of data') 61 | args = parser.parse_args() 62 | return args 63 | 64 | 65 | def save_results(out, result): 66 | cls = [k for k in result.keys()] 67 | val = [v for v in result.values()] 68 | tot = sum(val) 69 | val = list(map(lambda x: x / tot, val)) 70 | 71 | # save json 72 | result['total'] = tot 73 | result_json = json.dumps(result, indent=4) 74 | f = open(osp.join(out, 'annotation_distribution(min).json'), 'w') 75 | print(result_json, file=f) 76 | f.close() 77 | 78 | # save plot 79 | df = pd.DataFrame({'Class': cls, 'Value': val}) 80 | sns.set(rc={'figure.figsize': (15, 13)}) 81 | fig = sns.barplot(x='Class', y='Value', data=df) 82 | fig.set_xticklabels(fig.get_xticklabels(), rotation=15) 83 | fig.axes.set_title('Sample Distribution / Class ', 
fontsize=40) 84 | fig.set_xlabel('Class', fontsize=30) 85 | fig.set_ylabel('Total %', fontsize=20) 86 | output = fig.get_figure() 87 | output.savefig(osp.join(out, 'annotation_distribution.jpg')) 88 | 89 | 90 | def main(): 91 | args = parse_args() 92 | ann_count = dict.fromkeys(utils.annotations_dic(args.ann), 0) 93 | if args.level == 1: 94 | search = osp.join(args.csv_dir, '*') 95 | elif args.level == 2: 96 | search = osp.join(args.csv_dir, '*', '*') 97 | annotations = [ 98 | item for item in glob.glob(search) if item.endswith(ANN_EXT) 99 | ] 100 | 101 | for ann in annotations: 102 | for action in get_actions_with_timestamps(ann): 103 | label = action['action'].replace('-', '_') 104 | duration = action['end'] - action['start'] 105 | if np.isnan(duration): 106 | # faulty annotation 107 | continue 108 | try: 109 | ann_count[label] += duration 110 | except KeyError: 111 | CONSOLE.print(f'{ann} has misspelled label {label}', 112 | style='yellow') 113 | 114 | ann_count = {k: round(v / 60, 1) for k, v in ann_count.items()} 115 | save_results(args.out_dir, ann_count) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /src/analysis/evaluate_acc_per_cls.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pickle 4 | import sys 5 | from argparse import ArgumentParser 6 | 7 | import pandas as pd 8 | import seaborn as sns 9 | import torch 10 | from mmaction.apis import inference_recognizer, init_recognizer 11 | from rich.console import Console 12 | from tqdm import tqdm 13 | 14 | # https://stackoverflow.com/questions/4383571/importing-files-from-different-folder 15 | sys.path.append('human-action-recognition/') # noqa 16 | import har.tools.helpers as helpers # noqa isort:skips 17 | 18 | # sys.path.append('/mmaction2/') 19 | CONSOLE = Console() 20 | 21 | 22 | def parse_args(): 23 | parser = ArgumentParser(prog='accuracy per class for a bunch of clips') 24 | parser.add_argument('checkpoint', help='model') 25 | parser.add_argument('split', type=str, help='train/validation/test') 26 | parser.add_argument('--src-dir', 27 | type=str, 28 | default='/mmaction2/data/tanz/videos_val/', 29 | help='source dir of videos to be evaluated as clips') 30 | parser.add_argument('--out', 31 | type=str, 32 | default='/mnt/data_transfer/write/', 33 | help='out dir') 34 | parser.add_argument('--config', type=str, help='model config file') 35 | parser.add_argument( 36 | '--ann', 37 | type=str, 38 | default=('human-action-recognition/har/annotations/BAST/base/' 39 | 'tanz_annotations.txt'), 40 | help='classes/labels') 41 | parser.add_argument('--device', type=str, default='cuda:0', help='cpu/gpu') 42 | parser.add_argument('--type', 43 | default='rgb', 44 | choices=['rgb', 'skeleton'], 45 | help='rgb or skeleton') 46 | parser.add_argument('--topk', 47 | type=int, 48 | nargs='+', 49 | default=[1, 2, 3], 50 | choices=[1, 2, 3, 4, 5], 51 | help='top-k accuracy to evaluate') 52 | args = parser.parse_args() 53 | return args 54 | 55 | 56 | def save(args, results): 57 | # if args.out.endswith('.json'): 58 | # import json 59 | # results_json = json.dumps(results, indent=4) 60 | # f = open(out, 'w') 61 | # print(results , file=f) 62 | # f.close() 63 | out = osp.join(args.out, args.split + '_acc_per_class.csv') 64 | df = pd.DataFrame(results) 65 | df.to_csv(out, index=False) 66 | print('Saved {} csv file'.format(out)) 67 | 68 | sns.set(rc={'figure.figsize': 
(11.7, 8.27)}) 69 | fig = sns.barplot(x='Class', y='Value', hue='Accuracy', data=df) # 70 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) # 71 | fig.axes.set_title('Top 3 Accuracy ' + args.split + '-set', fontsize=40) 72 | fig.set_xlabel('Class', fontsize=30) 73 | fig.set_ylabel('Value', fontsize=20) 74 | output = fig.get_figure() # 75 | out = osp.splitext(out)[0] + '.jpg' 76 | output.savefig(out) 77 | print('Saved {} plot'.format(out)) 78 | 79 | 80 | def skeleton( 81 | args, 82 | number_to_label, 83 | model, 84 | ): 85 | total_count = helpers.bast_annotations_to_dict(args.ann) 86 | dist = {k: helpers.bast_annotations_to_dict(args.ann) for k in args.topk} 87 | 88 | for sample in tqdm(os.listdir(args.src_dir)): 89 | with open(osp.join(args.src_dir, sample), 'rb') as f: 90 | ann = pickle.load(f) 91 | ann['start_index'] = 0 92 | ann['modality'] = 'Pose' 93 | label = number_to_label[ann['label']] 94 | total_count[label] += 1 95 | result = inference_recognizer(model, ann) 96 | 97 | previous_k = 0 98 | for k in args.topk: 99 | # if its in top 1 & 2 it will count for top 3 100 | for i in range(previous_k, k): 101 | if number_to_label[result[i][0]] == label: 102 | dist[k][label] += 1 103 | for j in args.topk: 104 | # if its in top 3 it will count for top 4 105 | if (j != k) & (j > k): 106 | dist[j][label] += 1 107 | previous_k = k 108 | 109 | results = [] 110 | for i in dist.keys(): 111 | for k, v in dist[i].items(): 112 | acc = (v / total_count[k]) if total_count[k] != 0 else 0 113 | results.append({'Class': k, 'Accuracy': f'acc_{i}', 'Value': acc}) 114 | save(args, results) 115 | 116 | no_labels = 0 117 | for k in total_count.keys(): 118 | if total_count[k] > 0: 119 | no_labels += 1 120 | for i in dist.keys(): 121 | macro_acc = 0 122 | for k, v in dist[i].items(): 123 | if total_count[k] == 0: 124 | macro_acc += 0 125 | else: 126 | macro_acc += v / total_count[k] 127 | 128 | CONSOLE.print( 129 | f'Macro top-{i} Acc: ' 130 | f'{round(100 * macro_acc / no_labels, 3)}', 131 | style='green') 132 | 133 | 134 | def rgb(args, number_to_label, model): 135 | total_count = helpers.bast_annotations_to_dict(args.ann) 136 | dist = {k: helpers.bast_annotations_to_dict(args.ann) for k in args.topk} 137 | 138 | for label in tqdm(os.listdir(args.src_dir)): 139 | class_dir = osp.join(args.src_dir, label) 140 | 141 | for clip in tqdm(os.listdir(class_dir)): 142 | previous_k = 0 143 | total_count[label] += 1 144 | result = inference_recognizer(model, osp.join(class_dir, clip)) 145 | 146 | for k in args.topk: 147 | # if its in top 1 & 2 it will count for top 3 148 | for i in range(previous_k, k): 149 | if number_to_label[result[i][0]] == label: 150 | dist[k][label] += 1 151 | for j in args.topk: 152 | # if its in top 3 it will count for top 4 153 | if (j != k) & (j > k): 154 | dist[j][label] += 1 155 | previous_k = k 156 | 157 | results = [] 158 | for i in dist.keys(): 159 | for k, v in dist[i].items(): 160 | acc = (v / total_count[k]) if total_count[k] != 0 else 0 161 | results.append({'Class': k, 'Accuracy': f'acc_{i}', 'Value': acc}) 162 | save(args, results) 163 | 164 | no_labels = 0 165 | for k in total_count.keys(): 166 | if total_count[k] > 0: 167 | no_labels += 1 168 | for i in dist.keys(): 169 | macro_acc = 0 170 | for k, v in dist[i].items(): 171 | if total_count[k] == 0: 172 | macro_acc += 0 173 | else: 174 | macro_acc += v / total_count[k] 175 | 176 | CONSOLE.print( 177 | f'Macro top-{i} Acc: ' 178 | f'{round(100 * macro_acc / no_labels, 3)}', 179 | style='green') 180 | 181 | 182 | def main(): 
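    # Builds the recognizer from `config` + `checkpoint` via `init_recognizer`,
    # maps class indices to label names, and dispatches to `rgb()` or
    # `skeleton()` depending on `--type`. Both paths accumulate per-class
    # top-k hits, write a CSV and a bar plot through `save()`, and print the
    # macro top-k accuracy over the classes that actually contain samples.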
183 | args = parse_args() 184 | model = init_recognizer(args.config, args.checkpoint, 185 | torch.device(args.device)) 186 | CONSOLE.print( 187 | f'# Evaluating accuracy per class for the {args.split}-set ' 188 | f'of config file {args.config.split("/")[-1]}', 189 | style='green') 190 | number_to_label = helpers.bast_number_to_label(args.ann) 191 | callback = rgb if args.type == 'rgb' else skeleton 192 | callback(args, number_to_label, model) 193 | 194 | 195 | if __name__ == '__main__': 196 | main() 197 | -------------------------------------------------------------------------------- /src/analysis/pose_feasibility.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import os 3 | import os.path as osp 4 | import sys 5 | import warnings 6 | from argparse import ArgumentParser 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | import seaborn as sns 11 | from rich.console import Console 12 | from tqdm import tqdm 13 | 14 | sys.path.append('./src') # noqa: E501 15 | import data.pose_extraction as pose_extraction # noqa isort:skip 16 | 17 | try: 18 | from mmdet.apis import init_detector 19 | from mmpose.apis import init_pose_model 20 | except ImportError: 21 | warnings.warn( 22 | 'Please install MMDet and MMPose for pose extraction.') # noqa: E501 23 | 24 | CONSOLE = Console() 25 | POSE_EXTR_PATH = 'src/data/skeleton/pose_extraction.py' 26 | PROGRESS_FILE = 'pose_feasibility_progress.txt' 27 | 28 | 29 | def parse_args(): 30 | parser = ArgumentParser(prog='check the pose feasibility for a class' 31 | 'Also generates the .pkl pose dicts.') 32 | parser.add_argument('label', help='class/label to examine') 33 | parser.add_argument('--src-dir', 34 | default='mmaction2/data/phar', 35 | help='directory of dataset') 36 | parser.add_argument('--out-dir', default='mmaction2/data/phar/pose') 37 | parser.add_argument('--ann', 38 | type=str, 39 | default='resources/annotations/pose.txt', 40 | help='annotation file') 41 | parser.add_argument('--splits', 42 | nargs='+', 43 | default=['train', 'val', 'test'], 44 | choices=['train', 'val', 'test'], 45 | help='the splits where clips are found') 46 | parser.add_argument('--pose-score-thr', 47 | type=float, 48 | default=0.2, 49 | help='pose estimation score threshold') 50 | parser.add_argument('--resume', 51 | action='store_true', 52 | help='ggf. resume analysis from previous run') 53 | parser.add_argument('--device', default='cuda:0', help='device') 54 | parser.add_argument( 55 | '--det-config', 56 | default=('mmdetection/configs/faster_rcnn/' 57 | 'faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py'), 58 | help='human detector config') 59 | parser.add_argument( 60 | '--det-checkpoint', 61 | default='checkpoints/detector/faster_rcnn_r50_fpn_1x_coco-person.pth', 62 | help='human detector checkpoint') 63 | parser.add_argument( 64 | '--pose-config', 65 | default=('mmpose/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/' 66 | 'coco/hrnet_w32_coco_256x192.py'), 67 | help='pose estimation config') 68 | parser.add_argument('--pose-checkpoint', 69 | default='checkpoints/pose/hrnet_w32_coco_256x192.pth', 70 | help='pose estimation checkpoint') 71 | args = parser.parse_args() 72 | return args 73 | 74 | 75 | def get_pose(args, d_model, p_model): 76 | """Perform pose estimation given a video. 
77 | 78 | Args: 79 | args (dict): parsed args 80 | d_model: detection model 81 | p_model: pose model 82 | 83 | Returns: 84 | int: correct poses rate 85 | """ 86 | return pose_extraction.main(args, d_model, p_model) 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | # percentiles of clips having correct poses: 92 | # for a certain percentile it means: {n_videos_in_percentile / total_vids} 93 | # have {percentile %} of their poses with a confidence higher than 94 | # {args.pose_score_thr} 95 | results = {k: 0 for k in range(0, 101, 10)} 96 | det_model = init_detector(args.det_config, args.det_checkpoint, 97 | args.device) 98 | pose_model = init_pose_model(args.pose_config, args.pose_checkpoint, 99 | args.device) 100 | 101 | sub_args = abc.ABC() 102 | sub_args = abc.abstractproperty() 103 | sub_args.device = args.device 104 | sub_args.det_score_thr = 0.5 105 | sub_args.pose_score_thr = args.pose_score_thr 106 | sub_args.ann = args.ann 107 | sub_args.correct_rate = 0.2 108 | sub_args.filter_pose = False 109 | 110 | resume_list = [] 111 | if args.resume: 112 | if osp.exists(PROGRESS_FILE): 113 | with open(PROGRESS_FILE, 'r') as resume_from: 114 | resume_list = resume_from.readlines()[0].split(',') 115 | else: 116 | CONSOLE.print( 117 | f'Resume option selected but {PROGRESS_FILE} not' 118 | ' found.', 119 | style='yellow') 120 | 121 | for split in args.splits: 122 | out_dir = osp.join(args.out_dir, split, args.label) 123 | in_dir = osp.join(args.src_dir, split, args.label) 124 | Path(out_dir).mkdir(parents=True, exist_ok=True) 125 | sub_args.out_dir = out_dir 126 | 127 | for clip in tqdm(os.listdir(in_dir)): 128 | if clip in resume_list: 129 | CONSOLE.print(f'Already processed. Skipping {clip}.', 130 | style='green') 131 | continue 132 | 133 | sub_args.video = osp.join(in_dir, clip) 134 | result = get_pose(sub_args, det_model, pose_model) 135 | if result is None: 136 | CONSOLE.print(f'{clip} already exists. 
Skipping.', 137 | style='green') 138 | continue 139 | 140 | result *= 100 141 | for k in results.keys(): 142 | if result > k: 143 | results[k] += 1 144 | with open(PROGRESS_FILE, 'a+') as out: 145 | out.write(f'{clip},') 146 | 147 | # plot 148 | df = pd.DataFrame({ 149 | '%': list(results.keys()), 150 | 'Value': list(results.values()) 151 | }) 152 | sns.set(rc={'figure.figsize': (15, 13)}) 153 | fig = sns.barplot(x='%', y='Value', data=df) 154 | fig.set_xticklabels(fig.get_xticklabels(), rotation=30) 155 | fig.axes.set_title(f'Correct Poses ({args.pose_score_thr}-conf-thr)', 156 | fontsize=40) 157 | fig.set_xlabel('%', fontsize=30) 158 | fig.set_ylabel('Values', fontsize=20) 159 | output = fig.get_figure() 160 | 161 | out = osp.join(args.out_dir, f'correct_poses_rate_{args.label}.jpg') 162 | output.savefig(out) 163 | CONSOLE.print(f'Saved @{out}') 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /src/analysis/print_layers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from mmaction.apis import init_recognizer 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser(prog='model layer printer') 8 | parser.add_argument('config', help='config file path') 9 | parser.add_argument('checkpoint', help='checkpoint file') 10 | parser.add_argument('--device', 11 | type=str, 12 | default='cuda:0', 13 | help='CPU/CUDA device option') 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def print_layers(model, layer_name): 19 | if len(model._modules) == 0: 20 | print(layer_name) 21 | else: 22 | for key in model._modules: 23 | name = key if len(layer_name) == 0 else layer_name + '/' + key 24 | print_layers(model._modules[key], name) 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | model = init_recognizer(args.config, args.checkpoint, device=args.device) 30 | print_layers(model, '') 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /src/data/README.md: -------------------------------------------------------------------------------- 1 | 2 | # phar 3 | 4 | Make Porn Great Again 5 | 6 | ## Building datasets 7 | 8 | ### Build Video Dataset 9 | 10 | 1. Define the annotations @resources/annotations/annotations.txt 11 | 2. Create the clip dataset using `src/data/generate_dataset.py` 12 | 3. Downgrade the quality of the videos using `src/data/resize_videos.py`. Training will be much faster as resize overhead is removed. 13 | 4. Optionally, use `RepeatDataset` to further speed up training. 14 | 5. Use `mmaction2/src/analysis/check_videos.py` to check if the dataset is valid. 15 | 16 | ### Build Pose Dataset 17 | 18 | 1. Define the annotations @resources/annotations/annotations_pose.txt 19 | 2. Extract the pose information from the videos with `src/analysis/pose_feasibility.py` or `src/data/skeleton/generate_dataset_pose.py` 20 | - Prefer `pose_feasibility`: it discards poses with low confidence and also reports how hard it is to extract reliable poses from the dataset. 21 | 3. Merge the poses into lists with `merge_pose` @`src/misc.py` 22 | 23 | ### Build Audio Dataset 24 | 25 | 1. Define the annotations @resources/annotations/annotations_audio.txt 26 | 2. Extract the audio from the videos with `mmaction2/src/data/extract_audio.py` 27 | - `Stream map '0:a' matches no streams` means that the videos have no audio! 28 | 3.
Optionally, filter the extracted audio files based on their loudness with `src/analysis/audio_filter.py` 29 | 4. Extract spectrogram features with `mmaction2/src/data/build_audio_features.py` 30 | 5. Generate annotation list with `src/data/audio/build_file_list.py` 31 | -------------------------------------------------------------------------------- /src/data/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/data/__int__.py -------------------------------------------------------------------------------- /src/data/augment_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os.path as osp 4 | import random 5 | import shutil 6 | import sys 7 | from itertools import repeat 8 | from multiprocessing import cpu_count 9 | from pathlib import Path 10 | 11 | import cv2 12 | import numpy as np 13 | from rich.console import Console 14 | from tqdm import tqdm 15 | from tqdm.contrib.concurrent import process_map 16 | from vidaug import augmentors as va 17 | 18 | sys.path.append('./src') # noqa 19 | import utils as utils # noqa isort:skip 20 | 21 | CONSOLE = Console() 22 | AUGS = [ 23 | va.InvertColor(), 24 | va.InvertColor(), 25 | va.Add(value=100), 26 | va.Add(value=-100), 27 | va.Pepper(ratio=45), 28 | va.Pepper(ratio=15), 29 | va.Salt(ratio=100), 30 | va.Salt(ratio=25), 31 | va.GaussianBlur(sigma=1.2), 32 | va.GaussianBlur(sigma=2), 33 | va.GaussianBlur(sigma=3.5), 34 | va.ElasticTransformation(alpha=1.5, sigma=0.5), 35 | va.ElasticTransformation(alpha=3.5, sigma=0.5), 36 | va.PiecewiseAffineTransform(displacement=4, 37 | displacement_kernel=2, 38 | displacement_magnification=3), 39 | va.PiecewiseAffineTransform(displacement=2, 40 | displacement_kernel=1, 41 | displacement_magnification=2) 42 | ] 43 | 44 | 45 | def parse_args(): 46 | parser = argparse.ArgumentParser(description='Augmenting train set script') 47 | parser.add_argument('--src-dir', 48 | default='mmaction2/data/phar/train', 49 | help='source video directory') 50 | parser.add_argument('--out-dir', 51 | default='mmaction2/data/phar/train_aug/', 52 | help='augmented video directory') 53 | parser.add_argument('--rate', 54 | type=float, 55 | default=0.3, 56 | help='replacement rate for videos') 57 | parser.add_argument('--ann', 58 | type=str, 59 | default='resources/annotations/annotations.txt', 60 | help='annotation file') 61 | parser.add_argument('--num-processes', 62 | type=int, 63 | default=(cpu_count() - 2 or 1), 64 | help='number of processes used') 65 | args = parser.parse_args() 66 | return args 67 | 68 | 69 | def augment_video(items): 70 | """Augments a video.
71 | 72 | Args: 73 | clip (str): path to video 74 | out_dir (str): path to out dir 75 | """ 76 | clip, out_dir, random_clips = items 77 | if clip not in random_clips: 78 | # no augmentation, just copy it 79 | shutil.copy(clip, out_dir) 80 | return 81 | 82 | video = cv2.VideoCapture(clip) 83 | out = osp.join(out_dir, osp.basename(clip)) 84 | video_writer = cv2.VideoWriter( 85 | out, cv2.VideoWriter_fourcc(*'mp4v'), video.get(cv2.CAP_PROP_FPS), 86 | (round(video.get(cv2.CAP_PROP_FRAME_WIDTH)), 87 | round(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) 88 | 89 | frames = [] 90 | while cv2.waitKey(1) < 0: 91 | success, frame = video.read() 92 | if not success: 93 | video.release() 94 | break 95 | frames.append(frame) 96 | 97 | aug = random.choice(AUGS) 98 | frames = aug(np.array(frames)) 99 | for frame in frames: 100 | video_writer.write(frame) 101 | 102 | 103 | def main(): 104 | args = parse_args() 105 | Path(args.out_dir).mkdir(parents=True, exist_ok=True) 106 | assert 0 < args.rate < 1.0 107 | 108 | for label in tqdm(utils.annotations_list(args.ann)): 109 | out_dir_label = osp.join(args.out_dir, label) 110 | Path(out_dir_label).mkdir(parents=True, exist_ok=True) 111 | clips = glob.glob(osp.join(args.src_dir, label, '*')) 112 | random_clips = random.sample(clips, round(len(clips) * args.rate)) 113 | CONSOLE.print(f'Augmenting {len(random_clips)} clips for {label}...', 114 | style='bold green') 115 | 116 | process_map(augment_video, 117 | zip(clips, repeat(out_dir_label), repeat(random_clips)), 118 | max_workers=args.num_processes, 119 | total=len(clips)) 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /src/data/build_file_list.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import shutil 3 | import sys 4 | from argparse import ArgumentParser 5 | 6 | import numpy as np 7 | from rich.console import Console 8 | 9 | sys.path.append('./tools') # noqa 10 | import utils as utils # noqa isort:skip 11 | 12 | CONSOLE = Console() 13 | SPEC_EXT = '.npy' 14 | 15 | 16 | def parse_args(): 17 | parser = ArgumentParser(prog='generate file list for audio dataset based ' 18 | 'on video list') 19 | parser.add_argument( 20 | 'src_dir', 21 | type=str, 22 | help='root dir for video dataset where the ann files are generated') 23 | parser.add_argument( 24 | '--audio-dir', 25 | type=str, 26 | default='audio_feature', 27 | help='audio subdir inside the src_dir that contains spectograms') 28 | parser.add_argument('--split', 29 | type=str, 30 | nargs='+', 31 | default=['train', 'val', 'test'], 32 | help='splits where the spectograms are located') 33 | parser.add_argument('--ann', 34 | type=str, 35 | default='resources/annotations/annotations_audio.txt', 36 | help='audio annotations') 37 | args = parser.parse_args() 38 | return args 39 | 40 | 41 | def main(): 42 | args = parse_args() 43 | if not osp.exists(args.src_dir): 44 | CONSOLE.print(f'{args.src_dir} not found', style='red') 45 | return 46 | 47 | ann_to_list = utils.annotations_dic(args.ann) 48 | for split in args.split: 49 | split = split + '.txt' 50 | out_dir = osp.join(args.src_dir, args.audio_dir, split) 51 | shutil.copyfile(osp.join(args.src_dir, split), out_dir) 52 | 53 | with open(out_dir, 'r') as out: 54 | content = out.read() 55 | 56 | path = osp.splitext(out_dir)[0] 57 | with open(out_dir, 'w') as out: 58 | 59 | for line in content.split('\n'): 60 | if len(line) == 0: 61 | continue 62 | 63 | _, 
category, clip = line.rsplit(osp.sep, 2) 64 | new_path = osp.join(path, category, clip).split(' ')[0] 65 | new_class_id = ann_to_list.get(category, None) 66 | 67 | if new_class_id is not None: 68 | new_path = new_path.split('.')[0] + SPEC_EXT 69 | if not osp.exists(new_path): 70 | # corresponding .npy file doesn't exist (e.g. filtered) 71 | continue 72 | 73 | count = len(np.load(new_path)) 74 | out.write(f'{new_path} {count} {new_class_id}\n') 75 | 76 | CONSOLE.print(f'Created list file @{out_dir}', style='green') 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /src/data/generate_dataset.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path as osp 3 | import random 4 | import string 5 | import sys 6 | from argparse import ArgumentParser 7 | from itertools import repeat 8 | from multiprocessing import cpu_count 9 | from pathlib import Path 10 | 11 | import moviepy.editor as mpy 12 | import numpy as np 13 | import pandas as pd 14 | from rich.console import Console 15 | from tqdm.contrib.concurrent import process_map 16 | 17 | sys.path.append('./src') # noqa: E501 18 | import utils as utils # noqa isort:skip 19 | 20 | CONSOLE = Console() 21 | 22 | VIDEO_EXTS = ['mp4'] 23 | ANN_EXT = '.csv' 24 | ANN_TO_INDEX = dict() 25 | 26 | 27 | def gen_id(size=8): 28 | """Generate a random id.""" 29 | chars = string.ascii_uppercase + string.digits 30 | return ''.join(random.choice(chars) for _ in range(size)) 31 | 32 | 33 | def generate_structure(out_dir: str, annotations: str): 34 | """Generate the videos dataset structure. 35 | 36 | Args: 37 | out_dir (str): directory to generate structure for 38 | annotations (str): path to annotation file that has classes 39 | """ 40 | classes = utils.annotations_list(annotations) 41 | Path(out_dir).mkdir(parents=True, exist_ok=True) 42 | 43 | for split in ['train', 'val', 'test']: 44 | for c in classes: 45 | Path(osp.join(out_dir, split, c)).mkdir(parents=True, 46 | exist_ok=True) 47 | open(osp.join(out_dir, f'{split}.txt'), 'w').close() 48 | 49 | 50 | def get_video_annotation(id: int, anns: list) -> str: 51 | """Gets the annotation for a video based on its id. The assumption here is 52 | that both the video and its corresponding annotation have been named with a 53 | number. 54 | 55 | Args: 56 | id (int): video id 57 | anns (list): list of annotation paths 58 | 59 | Returns: 60 | ann (str): path to annotations 61 | """ 62 | return (ann for ann in anns if ann.split(osp.sep)[-1][:-4] == id) 63 | 64 | 65 | def parse_args(): 66 | parser = ArgumentParser(prog='generate video dataset.' 
67 | 'Videos have the same name as annotations') 68 | parser.add_argument('--src_dir', 69 | default='dataset/', 70 | help='source video directory') 71 | parser.add_argument('--ann', 72 | type=str, 73 | default='resources/annotations/all.txt', 74 | help='annotation file') 75 | parser.add_argument('--out-dir', 76 | default='mmaction2/data/phar', 77 | help='out video directory') 78 | parser.add_argument('--split', 79 | type=float, 80 | nargs='+', 81 | default=[0.8, 0.2, 0], 82 | help='train/val/test split') 83 | parser.add_argument('--clip-len', 84 | type=int, 85 | default=10, 86 | help='length of each clip') 87 | parser.add_argument('--num-processes', 88 | type=int, 89 | default=(cpu_count() - 2 or 1), 90 | help='number of processes used') 91 | parser.add_argument('--level', 92 | type=int, 93 | default=1, 94 | choices=[1, 2], 95 | help='directory level to find videos') 96 | args = parser.parse_args() 97 | return args 98 | 99 | 100 | def write_annotation(path: str): 101 | """Write the corresponding annotation to the annotation file. The 102 | annotation consists of the path to the video + label number. 103 | 104 | `mmaction2/data/temp/train/the_snake/DOZ9WC51.mp4 9` 105 | 106 | Args: 107 | path (str): path to the clip 108 | """ 109 | path_to_ann_f, label = osp.split(osp.dirname(path)) 110 | with open(f'{path_to_ann_f}.txt', 'a') as out: 111 | out.write(f'{path} {ANN_TO_INDEX[label]}') 112 | out.write('\n') 113 | 114 | 115 | def get_actions_with_timestamps(path: str) -> list: 116 | """Given the path to a csv file, get its timestamps. 117 | 118 | This function is specific to the temporal csv annotations 119 | produced by the VIA annotator: 120 | 121 | `Show/Hide attribute editor` -> Add `Activity` 122 | Name: "Activity"; 123 | Anchor: "Temporal Segment in Video or Audio"; 124 | Description: "Activity" 125 | 126 | Args: 127 | path (str): path to annotation 128 | 129 | Returns: 130 | list: list of timestamps 131 | """ 132 | results = [] 133 | df = pd.read_csv(path) 134 | for i in range(1, len(df)): 135 | temp = str(df.iloc[i].value_counts()).split(' ') 136 | results.append({ 137 | 'action': 138 | temp[0].split(':"')[1].strip('}"'), 139 | 'video': 140 | ''.join(list(filter(lambda x: x not in '["],', temp[6]))), 141 | 'start': 142 | float(temp[7][:-1]), 143 | 'end': 144 | float(temp[8][:-2]) 145 | }) 146 | return results 147 | 148 | 149 | def extract_clips(items): 150 | """Extract clips of length `args.clip_len` given a video and its 151 | annotations.""" 152 | video_f, anns, args = items 153 | ann = get_video_annotation(id=video_f.split(osp.sep)[-1][:-4], anns=anns) 154 | ann = next(ann, None) 155 | if ann is None: 156 | CONSOLE.print(f'Video {video_f} has no annotations.', style='yellow') 157 | return 158 | 159 | clip_len = args.clip_len 160 | min_remainder = clip_len / 2 # ggf. 
overlap 161 | np.random.seed() 162 | split = np.random.choice(['train', 'val', 'test'], p=args.split) 163 | video = mpy.VideoFileClip(video_f) 164 | 165 | for action in get_actions_with_timestamps(ann): 166 | duration = action['end'] - action['start'] 167 | if np.isnan(duration): 168 | # faulty annotation 169 | continue 170 | if duration < clip_len: 171 | continue 172 | 173 | label = action['action'].replace('-', '_') 174 | 175 | if ANN_TO_INDEX.get(label, None) is None: 176 | # skip if label not found 177 | continue 178 | 179 | n_clips = int(duration / clip_len) 180 | remainder = duration % clip_len 181 | 182 | for i in range(n_clips): 183 | start = action['start'] + i * clip_len 184 | end = start + clip_len 185 | subclip = video.subclip(start, end) 186 | out_f = f'{osp.join(args.out_dir, split, label, gen_id())}.mp4' 187 | 188 | try: 189 | subclip.write_videofile(out_f, logger=None) 190 | write_annotation(out_f) 191 | except OSError: 192 | CONSOLE.print(f'{video_f} has bad annotations', style='red') 193 | continue 194 | 195 | if remainder >= min_remainder: 196 | # small overlap will exist, but we savor`min_remainder` footage 197 | out_f = f'{osp.join(args.out_dir, split, label, gen_id())}.mp4' 198 | subclip = video.subclip(action['end'] - clip_len, action['end']) 199 | try: 200 | subclip.write_videofile(out_f, logger=None) 201 | write_annotation(out_f) 202 | except OSError: 203 | CONSOLE.print(f'{video_f} has bad annotations', style='red') 204 | pass 205 | 206 | 207 | def main(): 208 | args = parse_args() 209 | assert sum(args.split) == 1, 'train/val/test split must equal to 1' 210 | assert osp.exists(args.ann), 'provide label map file' 211 | generate_structure(args.out_dir, args.ann) 212 | global ANN_TO_INDEX 213 | ANN_TO_INDEX = utils.annotations_dic(args.ann) 214 | 215 | if args.level == 1: 216 | items = glob.glob(osp.join(args.src_dir, '*')) 217 | elif args.level == 2: 218 | items = glob.glob(osp.join(args.src_dir, '*', '*')) 219 | 220 | videos = [ 221 | item for item in items if any( 222 | item.endswith(ext) for ext in VIDEO_EXTS) 223 | ] 224 | annotations = [item for item in items if item.endswith(ANN_EXT)] 225 | np.random.shuffle(videos) 226 | 227 | process_map(extract_clips, 228 | zip(videos, repeat(annotations), repeat(args)), 229 | max_workers=args.num_processes, 230 | total=len(videos)) 231 | 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /src/data/generate_dataset_pose.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import os.path as osp 4 | import pickle 5 | import random 6 | import subprocess 7 | import sys 8 | from argparse import ArgumentParser 9 | from collections import defaultdict 10 | from itertools import repeat 11 | from multiprocessing import Pool 12 | from pathlib import Path 13 | 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | sys.path.append('human-action-recognition/') # noqa 18 | 19 | CLIPS_PATH = 'clips' 20 | RESULT_PATH = 'results' 21 | 22 | 23 | def merge_pose_data(in_dir, out_dir, split): 24 | """Given the pose estimation of single videos stored as dictionaries in. 25 | 26 | .pkl format, merge them together and form a list of dictionaries. 
27 | 28 | Args: 29 | in_dir ([string]): path to the .pkl files for individual clips 30 | out_dir ([string]): path to the out dir 31 | split ([string]): train, val, test 32 | """ 33 | result = [] 34 | for ann in os.listdir(in_dir): 35 | if ann.endswith('.pkl'): 36 | with open(osp.join(in_dir, ann), 'rb') as f: 37 | annotations = pickle.load(f) 38 | result.append(annotations) 39 | 40 | out_file = osp.join(out_dir, f'bast_{split}.pkl') 41 | with open(out_file, 'wb') as out: 42 | pickle.dump(result, out, protocol=pickle.HIGHEST_PROTOCOL) 43 | 44 | 45 | def generate_structure(path): 46 | Path(path).mkdir(parents=True, exist_ok=True) 47 | for split in ['train', 'val', 'test']: 48 | Path(osp.join(path, CLIPS_PATH, split)).mkdir(parents=True, 49 | exist_ok=True) 50 | Path(osp.join(path, RESULT_PATH)).mkdir(parents=True, exist_ok=True) 51 | 52 | 53 | def parse_args(): 54 | parser = ArgumentParser(prog='generate pose data for skeleton-based-har ' 55 | 'based on a VideoDataset directory.') 56 | parser.add_argument('src_dir', type=str, help='VideoDataset directory') 57 | parser.add_argument('split_set', 58 | nargs='+', 59 | choices=['train', 'val', 'test'], 60 | help='type of sets to generate the pose dataset for') 61 | parser.add_argument('--out-dir', 62 | type=str, 63 | default='data/skeleton/bast_base/', 64 | help='resulting dataset dir') 65 | parser.add_argument( 66 | '--ann', 67 | type=str, 68 | default=('human-action-recognition/har/annotations/BAST/base/' 69 | 'tanz_annotations.txt'), 70 | help='annotations') 71 | parser.add_argument('--devices', 72 | nargs='+', 73 | choices=['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3'], 74 | help='gpu to use; can parallelize for each split-set') 75 | args = parser.parse_args() 76 | return args 77 | 78 | 79 | def merge_results(args): 80 | for split in args.split_set: 81 | in_dir = osp.join(args.out_dir, CLIPS_PATH, split) 82 | out_dir = osp.join(args.out_dir, RESULT_PATH) 83 | merge_pose_data(in_dir, out_dir, split) 84 | 85 | 86 | def get_pose(video, args, split, gpu): 87 | script_path = ('human-action-recognition/har/tools/data/skeleton/' 88 | 'pose_extraction.py') 89 | if split == '': 90 | split = 'test' 91 | else: 92 | split = split.split('_')[1] 93 | 94 | out_dir = osp.join(args.out_dir, CLIPS_PATH, split) 95 | subargs = [ 96 | 'python', script_path, video, args.ann, '--out-dir', out_dir, 97 | '--device', gpu 98 | ] 99 | try: 100 | logging.info(subprocess.run(subargs)) 101 | except subprocess.CalledProcessError as e: 102 | logging.exception(f'Error while generating pose data for {video}: {e}') 103 | 104 | 105 | def extract_pose(pose_items): 106 | split_label, gpu, args = pose_items 107 | split, labels = split_label 108 | label_path = osp.join(args.src_dir, split) 109 | 110 | for label in labels: 111 | print(f'Extracting pose for {split} - {label}') 112 | clip_path = osp.join(label_path, label) 113 | 114 | for clip in tqdm(os.listdir(clip_path)): 115 | get_pose(osp.join(clip_path, clip), args, split, gpu) 116 | 117 | 118 | def main(): 119 | logging.basicConfig(filename='skeleton_dataset.log', level=logging.DEBUG) 120 | args = parse_args() 121 | generate_structure(args.out_dir) 122 | n_gpus = len(args.devices) 123 | pool = Pool(n_gpus) 124 | 125 | split_labels = [] 126 | for split in args.split_set: 127 | # based on the current structure of the `data-transfer` volume 128 | if split == 'test': 129 | split = '' 130 | else: 131 | split = 'videos_' + split 132 | 133 | labels = os.listdir(osp.join(args.src_dir, split)) 134 | random.shuffle(labels) 135 | # split_labels 
= [(train, [walk, ..., stamp]), ... 136 | # (val, [contract_expand, ..., fall])] 137 | n_splits = int(n_gpus / len(args.split_set)) 138 | split_labels += [(split, label_split) 139 | for label_split in np.array_split(labels, n_splits)] 140 | 141 | if len(args.devices) > 1: 142 | pool.map(extract_pose, zip(split_labels, args.devices, repeat(args))) 143 | else: 144 | print('Running on a single GPU') 145 | dd = defaultdict(list) 146 | # merge the splits 147 | for key, value in split_labels: 148 | if len(dd[key]) == 0: 149 | dd[key] = value 150 | else: 151 | for v in value: 152 | dd[key].append(v) 153 | split_labels = list(dd.items()) 154 | for split_label in split_labels: 155 | extract_pose((split_label, args.devices[0], args)) 156 | 157 | merge_results(args) 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /src/data/pose_extraction.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import argparse 3 | import logging 4 | import os 5 | import os.path as osp 6 | import random as rd 7 | import shutil 8 | import string 9 | import sys 10 | import warnings 11 | from collections import defaultdict 12 | 13 | import cv2 14 | import mmcv 15 | import numpy as np 16 | from rich.console import Console 17 | 18 | try: 19 | from mmdet.apis import inference_detector, init_detector 20 | from mmpose.apis import inference_top_down_pose_model, init_pose_model 21 | except ImportError: 22 | warnings.warn( 23 | 'Please install MMDet and MMPose for pose extraction.') # noqa: E501 24 | 25 | sys.path.append('src/') # noqa 26 | import utils as utils # noqa isort:skip 27 | 28 | MMDET_ROOT = 'mmdetection' 29 | MMPOSE_ROOT = 'mmpose' 30 | args = abc.ABC() 31 | args = abc.abstractproperty() 32 | args.det_config = f'{MMDET_ROOT}/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py' # noqa: E501 33 | args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 34 | args.pose_config = f'{MMPOSE_ROOT}/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py' # noqa: E501 35 | args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 36 | 37 | N_PERSON = 2 # * for bboxes 38 | ANN_TO_INDEX = dict() 39 | CONSOLE = Console() 40 | 41 | 42 | def gen_id(size=8): 43 | chars = string.ascii_uppercase + string.digits 44 | return ''.join(rd.choice(chars) for _ in range(size)) 45 | 46 | 47 | def extract_frame(video_path): 48 | dname = gen_id() 49 | os.makedirs(dname, exist_ok=True) 50 | frame_tmpl = osp.join(dname, 'img_{:05d}.jpg') 51 | vid = cv2.VideoCapture(video_path) 52 | frame_paths = [] 53 | flag, frame = vid.read() 54 | first_frame = frame 55 | cnt = 0 56 | while flag: 57 | frame_path = frame_tmpl.format(cnt + 1) 58 | frame_paths.append(frame_path) 59 | 60 | cv2.imwrite(frame_path, frame) 61 | cnt += 1 62 | flag, frame = vid.read() 63 | 64 | # corrupted video, no frame 65 | if first_frame is None: 66 | return None, None 67 | 68 | return frame_paths, first_frame.shape[:2] 69 | 70 | 71 | def detection_inference(args, frame_paths, det_model=None): 72 | if det_model is None: 73 | model = init_detector(args.det_config, args.det_checkpoint, 74 | args.device) 75 | else: 76 | model = det_model 77 | assert model.CLASSES[0] == 'person', ('We 
require you to use a detector ' 78 | 'trained on COCO') 79 | results = [] 80 | CONSOLE.print('Performing Human Detection for each frame...', 81 | style='green') 82 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 83 | for frame_path in frame_paths: 84 | result = inference_detector(model, frame_path) 85 | # We only keep human detections with score larger than det_score_thr 86 | result = result[0][result[0][:, 4] >= args.det_score_thr] 87 | results.append(result) 88 | prog_bar.update() 89 | return results 90 | 91 | 92 | def intersection(b0, b1): 93 | l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) 94 | u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) 95 | return max(0, r - l) * max(0, d - u) 96 | 97 | 98 | def iou(b0, b1): 99 | i = intersection(b0, b1) 100 | u = area(b0) + area(b1) - i 101 | return i / u 102 | 103 | 104 | def area(b): 105 | return (b[2] - b[0]) * (b[3] - b[1]) 106 | 107 | 108 | def removedup(bbox): 109 | def inside(box0, box1, thre=0.8): 110 | return intersection(box0, box1) / area(box0) > thre 111 | 112 | num_bboxes = bbox.shape[0] 113 | if num_bboxes == 1 or num_bboxes == 0: 114 | return bbox 115 | valid = [] 116 | for i in range(num_bboxes): 117 | flag = True 118 | for j in range(num_bboxes): 119 | if i != j and inside(bbox[i], 120 | bbox[j]) and bbox[i][4] <= bbox[j][4]: 121 | flag = False 122 | break 123 | if flag: 124 | valid.append(i) 125 | return bbox[valid] 126 | 127 | 128 | def is_easy_example(det_results, num_person): 129 | threshold = 0.95 130 | 131 | def thre_bbox(bboxes, thre=threshold): 132 | shape = [sum(bbox[:, -1] > thre) for bbox in bboxes] 133 | ret = np.all(np.array(shape) == shape[0]) 134 | return shape[0] if ret else -1 135 | 136 | if thre_bbox(det_results) == num_person: 137 | det_results = [x[x[..., -1] > 0.95] for x in det_results] 138 | return True, np.stack(det_results) 139 | return False, thre_bbox(det_results) 140 | 141 | 142 | def bbox2tracklet(bbox): 143 | iou_thre = 0.6 144 | tracklet_id = -1 145 | tracklet_st_frame = {} 146 | tracklets = defaultdict(list) 147 | for t, box in enumerate(bbox): 148 | for idx in range(box.shape[0]): 149 | matched = False 150 | for tlet_id in range(tracklet_id, -1, -1): 151 | cond1 = iou(tracklets[tlet_id][-1][-1], box[idx]) >= iou_thre 152 | cond2 = (t - tracklet_st_frame[tlet_id] - 153 | len(tracklets[tlet_id]) < 10) 154 | cond3 = tracklets[tlet_id][-1][0] != t 155 | if cond1 and cond2 and cond3: 156 | matched = True 157 | tracklets[tlet_id].append((t, box[idx])) 158 | break 159 | if not matched: 160 | tracklet_id += 1 161 | tracklet_st_frame[tracklet_id] = t 162 | tracklets[tracklet_id].append((t, box[idx])) 163 | return tracklets 164 | 165 | 166 | def drop_tracklet(tracklet): 167 | tracklet = {k: v for k, v in tracklet.items() if len(v) > 5} 168 | 169 | def meanarea(track): 170 | boxes = np.stack([x[1] for x in track]).astype(np.float32) 171 | areas = (boxes[..., 2] - boxes[..., 0]) * (boxes[..., 3] - 172 | boxes[..., 1]) 173 | return np.mean(areas) 174 | 175 | tracklet = {k: v for k, v in tracklet.items() if meanarea(v) > 5000} 176 | return tracklet 177 | 178 | 179 | def distance_tracklet(tracklet): 180 | dists = {} 181 | for k, v in tracklet.items(): 182 | bboxes = np.stack([x[1] for x in v]) 183 | c_x = (bboxes[..., 2] + bboxes[..., 0]) / 2. 184 | c_y = (bboxes[..., 3] + bboxes[..., 1]) / 2. 
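# NOTE: the two offsets below measure each tracklet's mean centre distance from the fixed point (480, 270), i.e. the centre of a 960x540 frame; this appears to assume that resolution, so other frame sizes may need a different reference point.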
185 | c_x -= 480 186 | c_y -= 270 187 | c = np.concatenate([c_x[..., None], c_y[..., None]], axis=1) 188 | dist = np.linalg.norm(c, axis=1) 189 | dists[k] = np.mean(dist) 190 | return dists 191 | 192 | 193 | def tracklet2bbox(track, num_frame): 194 | # assign_prev 195 | bbox = np.zeros((num_frame, 5)) 196 | trackd = {} 197 | for k, v in track: 198 | bbox[k] = v 199 | trackd[k] = v 200 | for i in range(num_frame): 201 | if bbox[i][-1] <= 0.5: 202 | mind = np.Inf 203 | for k in trackd: 204 | if np.abs(k - i) < mind: 205 | mind = np.abs(k - i) 206 | bbox[i] = bbox[k] 207 | return bbox 208 | 209 | 210 | def tracklets2bbox(tracklet, num_frame): 211 | dists = distance_tracklet(tracklet) 212 | sorted_inds = sorted(dists, key=lambda x: dists[x]) 213 | dist_thre = np.Inf 214 | for i in sorted_inds: 215 | if len(tracklet[i]) >= num_frame / 2: 216 | dist_thre = 2 * dists[i] 217 | break 218 | 219 | dist_thre = max(50, dist_thre) 220 | 221 | bbox = np.zeros((num_frame, 5)) 222 | bboxd = {} 223 | for idx in sorted_inds: 224 | if dists[idx] < dist_thre: 225 | for k, v in tracklet[idx]: 226 | if bbox[k][-1] < 0.01: 227 | bbox[k] = v 228 | bboxd[k] = v 229 | bad = 0 230 | for idx in range(num_frame): 231 | if bbox[idx][-1] < 0.01: 232 | bad += 1 233 | mind = np.Inf 234 | mink = None 235 | for k in bboxd: 236 | if np.abs(k - idx) < mind: 237 | mind = np.abs(k - idx) 238 | mink = k 239 | bbox[idx] = bboxd[mink] 240 | return bad, bbox 241 | 242 | 243 | def bboxes2bbox(bbox, num_frame): 244 | ret = np.zeros((num_frame, 2, 5)) 245 | for t, item in enumerate(bbox): 246 | if item.shape[0] <= 2: 247 | ret[t, :item.shape[0]] = item 248 | else: 249 | inds = sorted(list(range(item.shape[0])), 250 | key=lambda x: -item[x, -1]) 251 | ret[t] = item[inds[:2]] 252 | for t in range(num_frame): 253 | if ret[t, 0, -1] <= 0.01: 254 | ret[t] = ret[t - 1] 255 | elif ret[t, 1, -1] <= 0.01: 256 | if t: 257 | if ret[t - 1, 0, -1] > 0.01 and ret[t - 1, 1, -1] > 0.01: 258 | if iou(ret[t, 0], ret[t - 1, 0]) > iou( 259 | ret[t, 0], ret[t - 1, 1]): 260 | ret[t, 1] = ret[t - 1, 1] 261 | else: 262 | ret[t, 1] = ret[t - 1, 0] 263 | return ret 264 | 265 | 266 | def det_postproc(det_results, vid): 267 | det_results = [removedup(x) for x in det_results] 268 | CONSOLE.print(f'\nn_person={N_PERSON}', style='green') 269 | 270 | is_easy, bboxes = is_easy_example(det_results, N_PERSON) 271 | if is_easy: 272 | msg = f'\n{vid} Easy Example' 273 | logging.info(msg) 274 | CONSOLE.print(msg, style='green') 275 | return bboxes 276 | 277 | tracklets = bbox2tracklet(det_results) 278 | tracklets = drop_tracklet(tracklets) 279 | 280 | msg = (f'\n{vid } Hard {N_PERSON}-person Example, ' 281 | f'found {len(tracklets)} tracklet') 282 | logging.info(msg) 283 | CONSOLE.print(msg, style='green') 284 | 285 | if N_PERSON == 1: 286 | if len(tracklets) == 1: 287 | tracklet = list(tracklets.values())[0] 288 | det_results = tracklet2bbox(tracklet, len(det_results)) 289 | # * return np.stack(det_results) - specific to the NTU dataset 290 | return np.stack( 291 | np.array([np.array([det_res]) for det_res in det_results])) 292 | else: 293 | _, det_results = tracklets2bbox(tracklets, len(det_results)) 294 | return np.array([np.array([det_res]) for det_res in det_results]) 295 | # * return det_results - specific to the NTU dataset 296 | 297 | # * n_person = 2 298 | 299 | if len(tracklets) == 0: 300 | # no bboxes found at all 301 | return [] 302 | 303 | if len(tracklets) <= 2: 304 | tracklets = list(tracklets.values()) 305 | bboxes = [] 306 | for tracklet in tracklets: 307 
| bboxes.append(tracklet2bbox(tracklet, len(det_results))[:, None]) 308 | bbox = np.concatenate(bboxes, axis=1) 309 | return bbox 310 | else: 311 | return bboxes2bbox(det_results, len(det_results)) 312 | 313 | 314 | def pose_inference(args, frame_paths, det_results, pose_model=None): 315 | if pose_model is None: 316 | model = init_pose_model(args.pose_config, args.pose_checkpoint, 317 | args.device) 318 | else: 319 | model = pose_model 320 | CONSOLE.print('Performing Human Pose Estimation for each frame...', 321 | style='green') 322 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 323 | 324 | num_frame = len(det_results) 325 | num_person = max([len(x) for x in det_results]) 326 | kp = np.zeros((num_person, num_frame, 17, 3), dtype=np.float32) 327 | 328 | for i, (f, d) in enumerate(zip(frame_paths, det_results)): 329 | # Align input format 330 | d = [dict(bbox=x) for x in list(d) if x[-1] > 0.5] 331 | pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] 332 | for j, item in enumerate(pose): 333 | kp[j, i] = item['keypoints'] 334 | prog_bar.update() 335 | return kp 336 | 337 | 338 | def pose_extraction(vid, 339 | filter_pose, 340 | thr=None, 341 | det_model=None, 342 | pose_model=None): 343 | frame_paths, img_shape = extract_frame(vid) 344 | if frame_paths is None and img_shape is None: 345 | CONSOLE.print(f'{vid} is corrupted', style='red') 346 | return -1, -1 347 | 348 | det_results = detection_inference(args, frame_paths, det_model) 349 | det_results = det_postproc(det_results, vid) 350 | if 0 == len(det_results): 351 | CONSOLE.print(f'No bounding boxes found for {vid}.', style='yellow') 352 | return None, None 353 | 354 | pose_results = pose_inference(args, frame_paths, det_results, pose_model) 355 | anno = dict() 356 | anno['keypoint'] = pose_results[..., :2] 357 | anno['keypoint_score'] = pose_results[..., 2] 358 | anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] 359 | anno['img_shape'] = img_shape 360 | anno['original_shape'] = img_shape 361 | anno['total_frames'] = pose_results.shape[1] 362 | anno['label'] = ANN_TO_INDEX[vid.split('/')[-2]] 363 | 364 | # filter pose estimation based on threshold 365 | n_person = anno['keypoint_score'].shape[0] 366 | n_frames = len(anno['keypoint_score'][0]) 367 | count_0 = 0 368 | for k in range(0, n_person): 369 | for i in range(0, n_frames): 370 | for j in range(0, 17): # 17 defined keypoints 371 | if anno['keypoint_score'][k][i][j] < thr: 372 | if filter_pose: 373 | anno['keypoint'][k][i][j] = 0 374 | count_0 += 1 375 | 376 | correct_rate = 1 - round(count_0 / (n_person * n_frames * 17), 3) 377 | CONSOLE.print( 378 | f'\n{100*correct_rate}% of poses have a threshold higher ' 379 | f'than {thr}', 380 | style='yellow') 381 | shutil.rmtree(osp.dirname(frame_paths[0])) 382 | 383 | return anno, correct_rate 384 | 385 | 386 | def parse_args(): 387 | parser = argparse.ArgumentParser( 388 | description='Generate Pose Annotation for a single video') 389 | parser.add_argument('video', type=str, help='source video') 390 | parser.add_argument('ann', type=str, help='dataset annotations') 391 | parser.add_argument('--out-dir', 392 | type=str, 393 | default='mmaction2/data/phar/pose', 394 | help='output dir') 395 | parser.add_argument('--det-score-thr', 396 | type=float, 397 | default=0.5, 398 | help='detection score threshold') 399 | parser.add_argument('--pose-score-thr', 400 | type=float, 401 | default=0.5, 402 | help='pose estimation score threshold') 403 | parser.add_argument('--correct-rate', 404 | type=float, 405 | default=0.5, 406 | 
help=('if less than this rate of frame poses have a ' 407 | 'lower confidence than `poses-score-thr`, do not' 408 | 'save the pkl result')) 409 | parser.add_argument( 410 | '--filter-pose', 411 | action='store_true', 412 | help='whether to set the pose estimation of frames ' 413 | 'with score confidence less than the threshold to zero') 414 | parser.add_argument('--device', type=str, default='cuda:0') 415 | args = parser.parse_args() 416 | return args 417 | 418 | 419 | def main(sub_args, det_model=None, pose_model=None): 420 | out = osp.join(sub_args.out_dir, 421 | osp.splitext(sub_args.video.split('/')[-1])[0]) + '.pkl' 422 | if osp.exists(out): 423 | CONSOLE.print(f'{out} exists. Skipping...', style='yellow') 424 | return 425 | 426 | global ANN_TO_INDEX, args 427 | args = sub_args 428 | ANN_TO_INDEX = utils.annotations_dic(args.ann) 429 | anno, correct_rate = pose_extraction(args.video, args.filter_pose, 430 | args.pose_score_thr, det_model, 431 | pose_model) 432 | if anno is None and correct_rate is None: 433 | return 0 434 | elif anno == -1 and correct_rate == -1: 435 | return 436 | 437 | # save poses if they don't have more than `args.incorrect_thr %` of poses 438 | # with a lower confidence than `args.poses_score_thr` 439 | if correct_rate > args.correct_rate: 440 | mmcv.dump(anno, out) 441 | 442 | return correct_rate 443 | 444 | 445 | if __name__ == '__main__': 446 | logging.basicConfig(filename='pose_extraction.log', level=logging.DEBUG) 447 | global_args = parse_args() 448 | args.device = global_args.device 449 | args.video = global_args.video 450 | args.out_dir = global_args.out_dir 451 | args.det_score_thr = global_args.det_score_thr 452 | args.pose_score_thr = global_args.pose_score_thr 453 | args.ann = global_args.ann 454 | args.correct_rate = global_args.correct_rate 455 | main(args) 456 | -------------------------------------------------------------------------------- /src/demo/__int__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlleshi/phar/241b2041f7bcebe319b772a99b26af0cd3f345fe/src/demo/__int__.py -------------------------------------------------------------------------------- /src/demo/demo_audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import argparse 3 | 4 | import torch 5 | from mmaction.apis import inference_recognizer, init_recognizer 6 | from mmcv import Config, DictAction 7 | from rich.console import Console 8 | 9 | CONSOLE = Console() 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='MMAction2 demo') 14 | parser.add_argument('config', help='test config file path') 15 | parser.add_argument('checkpoint', help='checkpoint file/url') 16 | parser.add_argument('audio', help='audio file') 17 | parser.add_argument('--label', 18 | default='resources/annotations/annotations_audio.txt', 19 | help='label file') 20 | parser.add_argument( 21 | '--cfg-options', 22 | nargs='+', 23 | action=DictAction, 24 | default={}, 25 | help='override some settings in the used config, the key-value pair ' 26 | 'in xxx=yyy format will be merged into config file. 
For example, ' 27 | "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") 28 | parser.add_argument('--device', 29 | type=str, 30 | default='cuda:0', 31 | help='CPU/CUDA device option') 32 | args = parser.parse_args() 33 | return args 34 | 35 | 36 | def main(): 37 | args = parse_args() 38 | device = torch.device(args.device) 39 | cfg = Config.fromfile(args.config) 40 | cfg.merge_from_dict(args.cfg_options) 41 | 42 | # build the recognizer from a config file and checkpoint file/url 43 | model = init_recognizer(cfg, args.checkpoint, device=device) 44 | if not args.audio.endswith('.npy'): 45 | raise NotImplementedError('Demo works on extracted audio features') 46 | 47 | results = inference_recognizer(model, args.audio) 48 | 49 | labels = open(args.label).readlines() 50 | labels = [x.strip() for x in labels] 51 | results = [(labels[k[0]], k[1]) for k in results] 52 | 53 | CONSOLE.print('Scores:', style='green') 54 | for result in results: 55 | CONSOLE.print(f'{result[0]}: ', result[1]) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /src/demo/demo_skeleton.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import shutil 5 | 6 | import cv2 7 | import mmcv 8 | import numpy as np 9 | import torch 10 | from mmaction.apis import inference_recognizer, init_recognizer 11 | from mmcv import DictAction 12 | from rich.console import Console 13 | 14 | try: 15 | from mmdet.apis import inference_detector, init_detector 16 | except (ImportError, ModuleNotFoundError): 17 | raise ImportError('Failed to import `inference_detector` and ' 18 | '`init_detector` form `mmdet.apis`. These apis are ' 19 | 'required in this demo! ') 20 | 21 | try: 22 | from mmpose.apis import (inference_top_down_pose_model, init_pose_model, 23 | vis_pose_result) 24 | except (ImportError, ModuleNotFoundError): 25 | raise ImportError('Failed to import `inference_top_down_pose_model`, ' 26 | '`init_pose_model`, and `vis_pose_result` form ' 27 | '`mmpose.apis`. These apis are required in this demo! 
') 28 | 29 | try: 30 | import moviepy.editor as mpy 31 | except ImportError: 32 | raise ImportError('Please install moviepy to enable output file') 33 | 34 | CONSOLE = Console() 35 | 36 | FONTFACE = cv2.FONT_HERSHEY_DUPLEX 37 | FONTSCALE = 0.85 38 | FONTCOLOR = (255, 255, 0) # BGR, white 39 | FONTCOLOR_SCORE = (0, 165, 255) 40 | THICKNESS = 1 41 | LINETYPE = 1 42 | 43 | # TODO: add json option 44 | 45 | 46 | def parse_args(): 47 | parser = argparse.ArgumentParser(description='MMAction2 demo') 48 | parser.add_argument('video', help='video file/url') 49 | parser.add_argument('out_filename', help='output filename') 50 | parser.add_argument( 51 | '--config', 52 | default=('configs/skeleton/posec3d/' 53 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint.py'), 54 | help='skeleton model config file path') 55 | parser.add_argument( 56 | '--checkpoint', 57 | default=('https://download.openmmlab.com/mmaction/skeleton/posec3d/' 58 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint/' 59 | 'slowonly_r50_u48_240e_ntu120_xsub_keypoint-6736b03f.pth'), 60 | help='skeleton model checkpoint file/url') 61 | parser.add_argument( 62 | '--det-config', 63 | default='mmaction2/demo/faster_rcnn_r50_fpn_2x_coco.py', 64 | help='human detection config file path (from mmdet)') 65 | parser.add_argument( 66 | '--det-checkpoint', 67 | default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' 68 | 'faster_rcnn_r50_fpn_2x_coco/' 69 | 'faster_rcnn_r50_fpn_2x_coco_' 70 | 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), 71 | help='human detection checkpoint file/url') 72 | parser.add_argument( 73 | '--pose-config', 74 | default='mmaction2/demo/hrnet_w32_coco_256x192.py', 75 | help='human pose estimation config file path (from mmpose)') 76 | parser.add_argument( 77 | '--pose-checkpoint', 78 | default=('https://download.openmmlab.com/mmpose/top_down/hrnet/' 79 | 'hrnet_w32_coco_256x192-c78dce93_20200708.pth'), 80 | help='human pose estimation checkpoint file/url') 81 | parser.add_argument('--det-score-thr', 82 | type=float, 83 | default=0.8, 84 | help='the threshold of human detection score') 85 | parser.add_argument('--label-map', 86 | default='tools/data/skeleton/label_map_ntu120.txt', 87 | help='label map file') 88 | parser.add_argument('--device', 89 | type=str, 90 | default='cuda:0', 91 | help='CPU/CUDA device option') 92 | parser.add_argument('--short-side', 93 | type=int, 94 | default=480, 95 | help='specify the short-side length of the image') 96 | parser.add_argument( 97 | '--cfg-options', 98 | nargs='+', 99 | action=DictAction, 100 | default={}, 101 | help='override some settings in the used config, the key-value pair ' 102 | 'in xxx=yyy format will be merged into config file. For example, ' 103 | "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") 104 | parser.add_argument('--pose-score-thr', 105 | type=float, 106 | default=0.4, 107 | help='pose estimation score threshold') 108 | parser.add_argument( 109 | '--correct-rate', 110 | type=float, 111 | default=0.4, 112 | help=('if less than this rate of frame poses have a ' 113 | 'lower confidence than `poses-score-thr`, skip the demo')) 114 | args = parser.parse_args() 115 | return args 116 | 117 | 118 | def frame_extraction(video_path, short_side): 119 | """Extract frames given video_path. 120 | 121 | Args: 122 | video_path (str): The video_path. 
123 | """ 124 | # Load the video, extract frames into ./tmp/video_name 125 | target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) 126 | os.makedirs(target_dir, exist_ok=True) 127 | # Should be able to handle videos up to several hours 128 | frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') 129 | vid = cv2.VideoCapture(video_path) 130 | frames = [] 131 | frame_paths = [] 132 | flag, frame = vid.read() 133 | cnt = 0 134 | new_h, new_w = None, None 135 | while flag: 136 | if new_h is None: 137 | h, w, _ = frame.shape 138 | new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) 139 | 140 | frame = mmcv.imresize(frame, (new_w, new_h)) 141 | 142 | frames.append(frame) 143 | frame_path = frame_tmpl.format(cnt + 1) 144 | frame_paths.append(frame_path) 145 | 146 | cv2.imwrite(frame_path, frame) 147 | cnt += 1 148 | flag, frame = vid.read() 149 | 150 | return frame_paths, frames 151 | 152 | 153 | def detection_inference(args, frame_paths): 154 | """Detect human boxes given frame paths. 155 | 156 | Args: 157 | args (argparse.Namespace): The arguments. 158 | frame_paths (list[str]): The paths of frames to do detection inference. 159 | 160 | Returns: 161 | list[np.ndarray]: The human detection results. 162 | """ 163 | model = init_detector(args.det_config, args.det_checkpoint, args.device) 164 | assert model.CLASSES[0] == 'person', ('We require you to use a detector ' 165 | 'trained on COCO') 166 | results = [] 167 | print('Performing Human Detection for each frame') 168 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 169 | for frame_path in frame_paths: 170 | result = inference_detector(model, frame_path) 171 | # We only keep human detections with score larger than det_score_thr 172 | result = result[0][result[0][:, 4] >= args.det_score_thr] 173 | results.append(result) 174 | prog_bar.update() 175 | return results 176 | 177 | 178 | def pose_inference(args, frame_paths, det_results): 179 | model = init_pose_model(args.pose_config, args.pose_checkpoint, 180 | args.device) 181 | ret = [] 182 | print('Performing Human Pose Estimation for each frame') 183 | prog_bar = mmcv.ProgressBar(len(frame_paths)) 184 | for f, d in zip(frame_paths, det_results): 185 | # Align input format 186 | d = [dict(bbox=x) for x in list(d)] 187 | pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] 188 | ret.append(pose) 189 | prog_bar.update() 190 | return ret 191 | 192 | 193 | def main(): 194 | args = parse_args() 195 | 196 | frame_paths, original_frames = frame_extraction(args.video, 197 | args.short_side) 198 | num_frame = len(frame_paths) 199 | h, w, _ = original_frames[0].shape 200 | 201 | # Get clip_len, frame_interval and calculate center index of each clip 202 | config = mmcv.Config.fromfile(args.config) 203 | config.merge_from_dict(args.cfg_options) 204 | for component in config.data.test.pipeline: 205 | if component['type'] == 'PoseNormalize': 206 | component['mean'] = (w // 2, h // 2, .5) 207 | component['max_value'] = (w, h, 1.) 
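# NOTE: the loop above patches any `PoseNormalize` step in the test pipeline with this clip's actual resolution, using the frame centre as the mean and the frame width/height as the scale, so keypoints are normalized relative to the clip rather than a hard-coded resolution.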
208 | 209 | model = init_recognizer(config, args.checkpoint, args.device) 210 | 211 | # Load label_map 212 | label_map = [x.strip() for x in open(args.label_map).readlines()] 213 | 214 | # Get Human detection results 215 | det_results = detection_inference(args, frame_paths) 216 | torch.cuda.empty_cache() 217 | 218 | pose_results = pose_inference(args, frame_paths, det_results) 219 | torch.cuda.empty_cache() 220 | 221 | fake_anno = dict(frame_dir='', 222 | label=-1, 223 | img_shape=(h, w), 224 | original_shape=(h, w), 225 | start_index=0, 226 | modality='Pose', 227 | total_frames=num_frame) 228 | num_person = max([len(x) for x in pose_results]) 229 | num_person = 2 # TODO: one person can also be in the frame 230 | CONSOLE.print(f'# Persons: {num_person}\n', style='green') 231 | 232 | num_keypoint = 17 233 | keypoint = np.zeros((num_person, num_frame, num_keypoint, 2), 234 | dtype=np.float16) 235 | keypoint_score = np.zeros((num_person, num_frame, num_keypoint), 236 | dtype=np.float16) 237 | for i, poses in enumerate(pose_results): 238 | for j, pose in enumerate(poses): 239 | pose = pose['keypoints'] 240 | try: 241 | keypoint[j, i] = pose[:, :2] 242 | except IndexError: 243 | continue 244 | keypoint_score[j, i] = pose[:, 2] 245 | 246 | fake_anno['keypoint'] = keypoint 247 | fake_anno['keypoint_score'] = keypoint_score 248 | count_0 = 0 249 | 250 | for k in range(0, num_person): 251 | for i in range(0, num_frame): 252 | for j in range(0, 17): # 17 defined keypoints 253 | if fake_anno['keypoint_score'][k][i][j] < args.pose_score_thr: 254 | # fake_anno['keypoint'][k][i][j] = 0 255 | count_0 += 1 256 | 257 | correct_rate = 1 - round(count_0 / (num_person * num_frame * 17), 3) 258 | if correct_rate < args.correct_rate: 259 | CONSOLE.print((f'Clip has correct rate of {correct_rate} lower than ' 260 | f'the threshold of {args.correct_rate}. 
Skipping...'), 261 | style='red') 262 | tmp_frame_dir = osp.dirname(frame_paths[0]) 263 | shutil.rmtree(tmp_frame_dir) 264 | return 265 | 266 | results = inference_recognizer(model, fake_anno) 267 | 268 | top_actions = 3 269 | action_labels = [label_map[results[i][0]] for i in range(top_actions)] 270 | action_scores = [results[i][1] for i in range(top_actions)] 271 | 272 | pose_model = init_pose_model(args.pose_config, args.pose_checkpoint, 273 | args.device) 274 | vis_frames = [ 275 | vis_pose_result(pose_model, frame_paths[i], pose_results[i]) 276 | for i in range(num_frame) 277 | ] 278 | x, y = 10, 30 279 | x_y_dist = 200 280 | for frame in vis_frames: 281 | i = 0 282 | for label, score in zip(action_labels, action_scores): 283 | i += 1 284 | cv2.putText(frame, label, (x, y * i), FONTFACE, FONTSCALE, 285 | FONTCOLOR, THICKNESS, LINETYPE) 286 | cv2.putText(frame, str(round(100 * score, 287 | 2)), (x + x_y_dist, y * i), FONTFACE, 288 | FONTSCALE, FONTCOLOR_SCORE, THICKNESS, LINETYPE) 289 | 290 | vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24) 291 | vid.write_videofile(args.out_filename, remove_temp=True) 292 | 293 | tmp_frame_dir = osp.dirname(frame_paths[0]) 294 | shutil.rmtree(tmp_frame_dir) 295 | 296 | 297 | if __name__ == '__main__': 298 | main() 299 | -------------------------------------------------------------------------------- /src/demo/long_video_demo_clips.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import random 5 | import string 6 | import subprocess 7 | from itertools import repeat 8 | from multiprocessing import Manager, Pool, cpu_count 9 | 10 | import moviepy.editor as mpy 11 | import numpy as np 12 | from rich.console import Console 13 | 14 | CONSOLE = Console() 15 | manager = Manager() 16 | clips = manager.list() 17 | json_res = manager.list() 18 | 19 | MIN_CLIP_DUR = None 20 | 21 | 22 | def gen_id(size=8): 23 | """Generate a random id.""" 24 | chars = string.ascii_uppercase + string.digits 25 | return ''.join(random.choice(chars) for _ in range(size)) 26 | 27 | 28 | def prettify(byte_content): 29 | decoded = byte_content.decode('utf-8') 30 | formatted_output = decoded.replace('\\n', '\n').replace('\\t', '\t') 31 | return formatted_output 32 | 33 | 34 | def delete_clips(clips): 35 | for clip in clips: 36 | try: 37 | os.unlink(clip) 38 | except FileNotFoundError: 39 | pass 40 | 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser( 44 | description='long video demo based on clips') 45 | parser.add_argument('video', help='video file') 46 | parser.add_argument('config', help='model config file') 47 | parser.add_argument('checkpoint', help='model checkpoint') 48 | parser.add_argument('out', help='out file. 
Video or Json') 49 | parser.add_argument('--ann', 50 | type=str, 51 | default='resources/annotations/annotations_pose.txt', 52 | help='for base or eval annotations') 53 | parser.add_argument('--type', 54 | type=str, 55 | default='pose', 56 | choices=['pose', 'recognition'], 57 | help='whether the demo will be pose or recognition') 58 | parser.add_argument('--num-processes', 59 | type=int, 60 | default=(cpu_count() - 1 or 1), 61 | help='Number of processes to extract subclips') 62 | parser.add_argument('--num-gpus', 63 | type=int, 64 | default=1, 65 | help='Number of gpus to perform pose-har') 66 | parser.add_argument('--subclip-duration', 67 | type=int, 68 | default=7, 69 | help='duration of subclips') 70 | args = parser.parse_args() 71 | return args 72 | 73 | 74 | def pose(items): 75 | gpu, clips, args = items 76 | script_path = 'src/demo/demo_skeleton.py' 77 | if not osp.exists(script_path): 78 | CONSOLE.print(f'{script_path} does not exist', style='red') 79 | for clip in clips: 80 | subargs = [ 81 | 'python', 82 | script_path, 83 | clip, 84 | clip, # overwrite original clip 85 | '--config', 86 | args.config, 87 | '--checkpoint', 88 | args.checkpoint, 89 | '--label-map', 90 | args.ann, # class annotations 91 | '--device', 92 | gpu 93 | ] 94 | result = subprocess.run(subargs, capture_output=True) 95 | error = result.stderr.decode('utf-8') 96 | if error: 97 | CONSOLE.print(error, style='red') 98 | 99 | 100 | def recognition(items): 101 | gpu, clips, args = items 102 | script_path = 'demo/demo.py' 103 | for clip in clips: 104 | subargs = [ 105 | 'python', 106 | script_path, 107 | args.config, 108 | args.checkpoint, 109 | clip, 110 | args.ann, # class annotations 111 | '--font-color', 112 | 'blue', 113 | '--out-filename', 114 | clip, # overwrite original clip 115 | '--device', 116 | gpu 117 | ] 118 | try: 119 | subprocess.check_output(subargs) 120 | except Exception as e: 121 | CONSOLE.print(e, style='bold red') 122 | 123 | 124 | def extract_subclip(items): 125 | ts, timestamps, video = items 126 | video = mpy.VideoFileClip(video) 127 | start = timestamps[ts[0]] 128 | finish = timestamps[ts[1]] 129 | 130 | clip_pth = f'{ts[0]}_{gen_id()}.mp4' 131 | clips.append(clip_pth) 132 | 133 | try: 134 | clip = video.subclip(start, finish) 135 | if clip.duration < MIN_CLIP_DUR: 136 | CONSOLE.print(f'Subclip duration < {MIN_CLIP_DUR}. 
Skipping...', 137 | style='yellow') 138 | return 139 | clip.write_videofile(clip_pth, logger=None, audio=False) 140 | except OSError as e: 141 | CONSOLE.print(e, style='bold red') 142 | pass 143 | finally: 144 | video.close() 145 | 146 | 147 | def merge_clips(clips, out): 148 | clips = sorted(clips, key=lambda x: int(x[2:4])) 149 | video_clips = [] 150 | for clip in clips: 151 | try: 152 | video_clips.append(mpy.VideoFileClip(clip)) 153 | except OSError: 154 | pass 155 | 156 | result = mpy.concatenate_videoclips(video_clips, method='compose') 157 | result.write_videofile(out) 158 | delete_clips(clips) 159 | 160 | 161 | def merge_json(json_res, time_segments, out): 162 | result = {} 163 | json_res = sorted(json_res, key=lambda x: int(x[:2])) 164 | for tup in zip(time_segments, json_res): 165 | result[str(tup[0])] = tup[1].split(' ', 1)[1].strip() 166 | 167 | import json 168 | with open(out, 'w') as f: 169 | json.dump(result, f, indent=2) 170 | 171 | 172 | def main(): 173 | args = parse_args() 174 | global MIN_CLIP_DUR 175 | MIN_CLIP_DUR = args.subclip_duration 176 | 177 | splits = int( 178 | mpy.VideoFileClip(args.video).duration / args.subclip_duration) 179 | timestamps = { 180 | f'ts{i:02}': args.subclip_duration * i 181 | for i in range(0, splits + 1) 182 | } 183 | time_segments = [(f'ts{i:02}', f'ts{i+1:02}') for i in range(0, splits)] 184 | # add a timestamp for any remaining segments < 10s 185 | rest_timestamp = f'ts{int(list(timestamps.keys())[-1][2:]) + 1}' 186 | timestamps[rest_timestamp] = None 187 | time_segments.append( 188 | (list(timestamps.keys())[-2], list(timestamps.keys())[-1])) 189 | 190 | CONSOLE.print('Extracting subclips...', style='green') 191 | pool1 = Pool(args.num_processes) 192 | gpus = [f'cuda:{i}' for i in range(args.num_gpus)] 193 | pool1.map(extract_subclip, 194 | zip(time_segments, repeat(timestamps), repeat(args.video))) 195 | 196 | pool2 = Pool(len(gpus)) 197 | callback = pose if args.type == 'pose' else recognition 198 | CONSOLE.print(f'Performing {args.type}...', style='green') 199 | clips_per_gpus = [ 200 | label_split for label_split in np.array_split(clips, args.num_gpus) 201 | ] 202 | pool2.map(callback, zip(gpus, clips_per_gpus, repeat(args))) 203 | 204 | merge_clips(clips, args.out.split('.')[0] + '.mp4') 205 | if args.out.endswith('.json'): 206 | merge_json(json_res, time_segments, args.out) 207 | 208 | 209 | if __name__ == '__main__': 210 | main() 211 | -------------------------------------------------------------------------------- /src/demo/visualize_heatmap_volume.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import sys 4 | 5 | import cv2 6 | import decord 7 | import moviepy.editor as mpy 8 | import numpy as np 9 | from mmaction.datasets.pipelines import Compose 10 | from mmcv import load 11 | 12 | from mmpose.apis import vis_pose_result 13 | from mmpose.models import TopDown 14 | 15 | sys.path.append('src/') # noqa 16 | import utils as utils # noqa isort:skip 17 | 18 | keypoint_pipeline = [ 19 | dict(type='PoseDecode'), 20 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 21 | dict(type='Resize', scale=(-1, 64)), 22 | dict(type='CenterCrop', crop_size=64), 23 | dict(type='GeneratePoseTarget', 24 | sigma=0.6, 25 | use_score=True, 26 | with_kp=True, 27 | with_limb=False) 28 | ] 29 | 30 | limb_pipeline = [ 31 | dict(type='PoseDecode'), 32 | dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), 33 | dict(type='Resize', scale=(-1, 64)), 34 | 
dict(type='CenterCrop', crop_size=64), 35 | dict(type='GeneratePoseTarget', 36 | sigma=0.6, 37 | use_score=True, 38 | with_kp=False, 39 | with_limb=True) 40 | ] 41 | 42 | FONTFACE = cv2.FONT_HERSHEY_DUPLEX 43 | FONTSCALE = 0.6 44 | FONTCOLOR = (255, 255, 255) 45 | BGBLUE = (0, 119, 182) 46 | THICKNESS = 1 47 | LINETYPE = 1 48 | 49 | 50 | def add_label(frame, label, BGCOLOR=BGBLUE): 51 | threshold = 30 52 | 53 | def split_label(label): 54 | label = label.split() 55 | lines, cline = [], '' 56 | for word in label: 57 | if len(cline) + len(word) < threshold: 58 | cline = cline + ' ' + word 59 | else: 60 | lines.append(cline) 61 | cline = word 62 | if cline != '': 63 | lines += [cline] 64 | return lines 65 | 66 | if len(label) > 30: 67 | label = split_label(label) 68 | else: 69 | label = [label] 70 | label = ['Action: '] + label 71 | 72 | sizes = [] 73 | for line in label: 74 | sizes.append(cv2.getTextSize(line, FONTFACE, FONTSCALE, THICKNESS)[0]) 75 | box_width = max([x[0] for x in sizes]) + 10 76 | text_height = sizes[0][1] 77 | box_height = len(sizes) * (text_height + 6) 78 | 79 | cv2.rectangle(frame, (0, 0), (box_width, box_height), BGCOLOR, -1) 80 | for i, line in enumerate(label): 81 | location = (5, (text_height + 6) * i + text_height + 3) 82 | cv2.putText(frame, line, location, FONTFACE, FONTSCALE, FONTCOLOR, 83 | THICKNESS, LINETYPE) 84 | return frame 85 | 86 | 87 | def vis_skeleton(vid_path, anno, category_name=None, ratio=0.5): 88 | vid = decord.VideoReader(vid_path) 89 | frames = [x.asnumpy() for x in vid] 90 | 91 | h, w, _ = frames[0].shape 92 | new_shape = (int(w * ratio), int(h * ratio)) 93 | frames = [cv2.resize(f, new_shape) for f in frames] 94 | 95 | assert len(frames) == anno['total_frames'] 96 | # The shape is N x T x K x 3 97 | kps = np.concatenate([anno['keypoint'], anno['keypoint_score'][..., None]], 98 | axis=-1) 99 | kps[..., :2] *= ratio 100 | # Convert to T x N x K x 3 101 | kps = kps.transpose([1, 0, 2, 3]) 102 | vis_frames = [] 103 | 104 | # we need an instance of TopDown model, so build a minimal one 105 | model = TopDown(backbone=dict(type='ShuffleNetV1')) 106 | 107 | for f, kp in zip(frames, kps): 108 | bbox = np.zeros([0, 4], dtype=np.float32) 109 | result = [dict(bbox=bbox, keypoints=k) for k in kp] 110 | vis_frame = vis_pose_result(model, f, result) 111 | 112 | if category_name is not None: 113 | vis_frame = add_label(vis_frame, category_name) 114 | 115 | vis_frames.append(vis_frame) 116 | return vis_frames 117 | 118 | 119 | def get_pseudo_heatmap(anno, flag='keypoint'): 120 | assert flag in ['keypoint', 'limb'] 121 | pipeline = Compose(keypoint_pipeline if flag == 122 | 'keypoint' else limb_pipeline) 123 | return pipeline(anno)['imgs'] 124 | 125 | 126 | def vis_heatmaps(heatmaps, channel=-1, ratio=8): 127 | # if channel is -1, draw all keypoints / limbs on the same map 128 | import matplotlib.cm as cm 129 | h, w, _ = heatmaps[0].shape 130 | newh, neww = int(h * ratio), int(w * ratio) 131 | 132 | if channel == -1: 133 | heatmaps = [np.max(x, axis=-1) for x in heatmaps] 134 | cmap = cm.viridis 135 | heatmaps = [(cmap(x)[..., :3] * 255).astype(np.uint8) for x in heatmaps] 136 | heatmaps = [cv2.resize(x, (neww, newh)) for x in heatmaps] 137 | return heatmaps 138 | 139 | 140 | def parse_args(): 141 | parser = argparse.ArgumentParser(description='Visualize Pose & Heatmap') 142 | parser.add_argument('video', type=str, help='source video') 143 | parser.add_argument('pose_ann', type=str, help='pose pickle annotation') 144 | parser.add_argument('--ann', 145 | type=str, 
146 | default='resources/annotations/annotations_pose.txt', 147 | help='dataset annotations') 148 | parser.add_argument('--det-score-thr', 149 | type=float, 150 | help='detection score threshold') 151 | parser.add_argument('--out-dir', type=str, default='demos/') 152 | parser.add_argument('--device', type=str, default='cuda:0') 153 | args = parser.parse_args() 154 | return args 155 | 156 | 157 | def main(): 158 | args = parse_args() 159 | anno = load(args.pose_ann) 160 | categories = utils.annotations_list(args.ann) 161 | video_name = osp.splitext(args.video.split('/')[-1])[0] 162 | 163 | # visualize skeleton 164 | vis_frames = vis_skeleton(args.video, 165 | anno, 166 | categories[anno['label']], 167 | ratio=1) 168 | cv2.imwrite(osp.join(args.out_dir, f'{video_name}_pose.jpg'), 169 | vis_frames[int(len(vis_frames) / 2)]) 170 | vid = mpy.ImageSequenceClip(vis_frames, fps=24) 171 | vid.write_videofile(osp.join(args.out_dir, f'{video_name}_pose.mp4')) 172 | 173 | # visualize heatmaps 174 | keypoint_heatmap = get_pseudo_heatmap(anno) 175 | keypoint_mapvis = vis_heatmaps(keypoint_heatmap) 176 | keypoint_mapvis = [ 177 | add_label(f, categories[anno['label']]) for f in keypoint_mapvis 178 | ] 179 | vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24) 180 | vid.write_videofile(osp.join(args.out_dir, f'{video_name}_heatmap.mp4')) 181 | 182 | 183 | if __name__ == '__main__': 184 | main() 185 | -------------------------------------------------------------------------------- /src/late_fusion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import sys 4 | 5 | from mmaction.core.evaluation import (get_weighted_score, mean_class_accuracy, 6 | top_k_accuracy) 7 | from mmcv import load 8 | from rich.console import Console 9 | from scipy.special import softmax 10 | 11 | sys.path.append('./tools') # noqa 12 | import utils as utils # noqa isort:skip 13 | 14 | CONSOLE = Console() 15 | 16 | 17 | def get_class_id(path: str) -> int: 18 | """Get the label id of a clip given its path (e.g. 1). 19 | 20 | Args: 21 | path (str): path to clip 22 | 23 | Returns: 24 | int: label id of clips 25 | """ 26 | return int(osp.splitext(osp.basename(path.split()[1]))[0]) 27 | 28 | 29 | def get_clip_id(path: str) -> str: 30 | """Get the name (id) of a clip given its path (e.g. XNXXNAER). 31 | 32 | Args: 33 | path (str): path to clip 34 | 35 | Returns: 36 | str: clip name(id) 37 | """ 38 | return osp.splitext(osp.basename(path.split()[0]))[0] 39 | 40 | 41 | def clip_ids(datalist: list) -> list: 42 | """Returns a list of clip ids given the datalist. 
43 | 44 | Args: 45 | datalist (list): label map 46 | 47 | Returns: 48 | list: of ids 49 | """ 50 | return [get_clip_id(d) for d in datalist] 51 | 52 | 53 | def parse_args(): 54 | parser = argparse.ArgumentParser(description='Fusing multiple scores') 55 | parser.add_argument('--scores', 56 | nargs='+', 57 | help='list of scores', 58 | default=['demo/fuse/rgb.pkl', 'demo/fuse/flow.pkl']) 59 | parser.add_argument('--coefficients', 60 | nargs='+', 61 | type=float, 62 | help='coefficients of each score file', 63 | default=[1.0, 1.0]) 64 | parser.add_argument( 65 | '--datalists', 66 | nargs='+', 67 | help='list of testing data', 68 | default=[ 69 | 'mmaction2/data/phar/val.txt', 70 | 'mmaction2/data/phar/audio_feature/filtered_20/val.txt' 71 | ]) 72 | parser.add_argument('--apply-softmax', action='store_true') 73 | parser.add_argument('--top-k', 74 | nargs='+', 75 | type=int, 76 | default=[1, 2, 3, 4, 5], 77 | help='top k accuracy to calculate') 78 | parser.add_argument('--label-map', 79 | nargs='+', 80 | help='annotation files', 81 | default=[ 82 | 'resources/annotations/annotations.txt', 83 | 'resources/annotations/annotations_audio.txt' 84 | ]) 85 | args = parser.parse_args() 86 | return args 87 | 88 | 89 | def main(): 90 | args = parse_args() 91 | assert len(args.scores) == len(args.coefficients) == len(args.label_map) 92 | 93 | lmaps = [] 94 | for lmap in args.label_map: 95 | lmaps.append(utils.annotations_dict_rev(lmap)) 96 | score_list = [load(f) for f in args.scores] 97 | data = [open(dl).readlines() for dl in args.datalists] 98 | 99 | # superset contains all the samples to be tested 100 | superset = max(data, key=len) 101 | superset_score = max(score_list, key=len) 102 | superset_lmap = max(lmaps, key=len) 103 | # remove the superset from the lists 104 | i = 0 105 | while i < len(data): 106 | if data[i] is superset: 107 | data.remove(data[i]) 108 | score_list.remove(score_list[i]) 109 | lmaps.remove(lmaps[i]) 110 | break 111 | i += 1 112 | 113 | # reload superset labels 114 | superset_lmap = utils.annotations_dic(args.label_map[i]) 115 | labels = [int(x.strip().split()[-1]) for x in superset] 116 | superset_ids = clip_ids(superset) 117 | for d in data: 118 | # CONSOLE.print(set(clip_ids(d)).difference(superset_ids)) 119 | assert set(clip_ids(d)).issubset(superset_ids) 120 | 121 | # order & fill in the scores of the subsets according to the superset 122 | ordered_scores = [] 123 | superset_ids = clip_ids(superset) 124 | zeros = [0 for _ in range(len(superset_score[0]))] 125 | for i in range(len(score_list)): 126 | ordered_scores.append(list()) 127 | data_ids = clip_ids(data[i]) 128 | for clip in superset: 129 | id = get_clip_id(clip) 130 | if id not in data_ids: 131 | ordered_scores[i].append(zeros) 132 | else: 133 | score = score_list[i][data_ids.index(id)] 134 | to_add = zeros.copy() 135 | for j in range(len(score)): 136 | # add the scores of the models with less classes in the 137 | # exact same position as it is in the model that contains 138 | # all the classes 139 | index = superset_lmap[lmaps[i][j]] 140 | to_add[index] = score[j] 141 | ordered_scores[i].append(to_add) 142 | 143 | ordered_scores.insert(0, superset_score) 144 | 145 | if args.apply_softmax: 146 | 147 | def apply_softmax(scores): 148 | return [softmax(score) for score in scores] 149 | 150 | ordered_scores = [apply_softmax(scores) for scores in ordered_scores] 151 | 152 | weighted_scores = get_weighted_score(ordered_scores, args.coefficients) 153 | CONSOLE.print('Weighted Scores', style='green') 154 | mean_class_acc = 
mean_class_accuracy(weighted_scores, labels) 155 | top_k = top_k_accuracy(weighted_scores, labels, args.top_k) 156 | print(f'Mean Class Accuracy: {mean_class_acc:.04f}') 157 | for k, topk in enumerate(top_k): 158 | CONSOLE.print(f'Top {k+1} Accuracy: {topk:.04f}') 159 | 160 | 161 | if __name__ == '__main__': 162 | main() 163 | -------------------------------------------------------------------------------- /src/record_experiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import re 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | 7 | import mlflow 8 | from rich.console import Console 9 | 10 | CONSOLE = Console() 11 | 12 | 13 | def parse_args(): 14 | parser = ArgumentParser(prog='track experiments with mlflow tracking' 15 | 'https://mlflow.org/docs/latest/tracking.html') 16 | parser.add_argument( 17 | 'experiment_name', 18 | help='name of experiment. Should correspond the model name') 19 | parser.add_argument( 20 | 'run_name', 21 | help='name of experiment run. Add things like hyperparameters here.') 22 | parser.add_argument('work_dir', help='dir where model files are stored') 23 | parser.add_argument('--mlrun-dir', 24 | default='./mlruns', 25 | help='mlrun storage dir. Leave default.') 26 | parser.add_argument('--data-dir', 27 | default='mmaction2/data/phar/', 28 | help='path to train/val/test dataset') 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def get_train_acc(log, start, topk_length, top_train): 34 | """Get training accuracy from mmaction2 log files.""" 35 | # * play with these two parameters if the results aren't perfect 36 | # audio: 1400, 6 37 | look_back, n_back = 1400, 6 38 | # train indexes start before needles[1] 39 | train_index = start 40 | # take average of last n_back readings 41 | for row in log[train_index - look_back:train_index].split('\t'): 42 | for i in range(1, 6): 43 | t = f'top{i}' 44 | sub_index = row.find(t) 45 | if sub_index == -1: 46 | break 47 | 48 | topk = row[sub_index:sub_index + topk_length] 49 | topk = float(topk.split('acc: ')[1]) 50 | top_train[t] += topk 51 | 52 | top_train = {k: round(v / n_back, 3) for k, v in top_train.items()} 53 | return top_train 54 | 55 | 56 | def get_train_val_acc(logs): 57 | """Get the validation & training accuracy from mmaction2 log files.""" 58 | 59 | # specific to mmaction2 logs 60 | needles = ('Now best checkpoint is saved as', 'Evaluating top_k_accuracy') 61 | topk_length = 15 62 | 63 | top_val = {f'top{k}': 0 for k in range(1, 6)} 64 | top_train = {f'top{k}': 0 for k in range(1, 6)} 65 | 66 | for log in logs: 67 | # find all indexes for new best models logs 68 | new_best_indexes = [m.start() for m in re.finditer(needles[0], log)] 69 | 70 | for index in new_best_indexes: 71 | # topks are replaced only if top1 is exceeded 72 | replace = False 73 | # find the start of the new best models log 74 | start = log[:index].rfind(needles[1]) 75 | 76 | for i in range(1, 6): 77 | t = f'top{i}' 78 | sub_index = log[start:index].find(t) 79 | topk = log[start + sub_index:start + sub_index + topk_length] 80 | topk = float(topk.split('acc')[1]) 81 | 82 | if topk > top_val[t] and t == 'top1': 83 | replace = True 84 | if replace: 85 | top_val[t] = topk 86 | 87 | if not replace: 88 | continue 89 | 90 | try: 91 | top_train = get_train_acc(log, start, topk_length, top_train) 92 | except IndexError: 93 | CONSOLE.print('Log is missing train infos', style='yellow') 94 | 95 | return top_train, top_val 96 | 97 | 98 | def 
get_last_model(dir): 99 | """Get the latest checkpoint of a model.""" 100 | latest = osp.join(dir, 'latest.pth') 101 | if os.path.exists(latest): 102 | os.remove(osp.join(latest)) 103 | models = [m for m in os.listdir(dir) if m.endswith('.pth')] 104 | 105 | return sorted(models, 106 | key=lambda x: int(''.join([d for d in x if d.isdigit()])), 107 | reverse=True) 108 | 109 | 110 | def get_top_model(dir): 111 | return [model for model in os.listdir(dir) if model[:4] == 'best'] 112 | 113 | 114 | def find_artifact(dir, ext, hint=''): 115 | """Given a folder, find files based on their extension and part of name.""" 116 | return [ 117 | file for file in os.listdir(dir) 118 | if (osp.splitext(file)[1] == ext and hint in file) 119 | ] 120 | 121 | 122 | def main(): 123 | args = parse_args() 124 | CONSOLE.print(f'Logging {args.experiment_name}-{args.run_name}...', 125 | style='green') 126 | Path(args.mlrun_dir).mkdir(parents=True, exist_ok=True) 127 | mlflow.set_tracking_uri(args.mlrun_dir) 128 | mlflow.set_experiment(args.experiment_name) 129 | 130 | with mlflow.start_run(run_name=args.run_name): 131 | logs = [] 132 | # log artifacts from work dir 133 | for ext in ['.json', '.log', '.py', '.txt', '.pkl']: 134 | for artifact in find_artifact(args.work_dir, ext): 135 | mlflow.log_artifact(osp.join(args.work_dir, artifact)) 136 | if ext == '.log': 137 | with open(osp.join(args.work_dir, artifact), 'r') as f: 138 | logs.append(f.read()) 139 | 140 | for ext in ['.txt', '.pkl']: 141 | for artifact in find_artifact(args.data_dir, ext): 142 | mlflow.log_artifact(osp.join(args.data_dir, artifact)) 143 | 144 | top_model = get_top_model(args.work_dir) 145 | if not top_model: 146 | CONSOLE.print(f'No best model found @{args.work_dir}', 147 | style='yellow') 148 | else: 149 | mlflow.log_artifact(osp.join(args.work_dir, top_model[0])) 150 | 151 | last_model = get_last_model(args.work_dir) 152 | if not last_model or len(last_model) == 1: 153 | CONSOLE.print(f'Last saved checkpoint not found @{args.work_dir}', 154 | style='yellow') 155 | else: 156 | last_model = list( 157 | filter(lambda x: not x.startswith('best'), last_model)) 158 | mlflow.log_artifact(osp.join(args.work_dir, last_model[0])) 159 | 160 | train_acc, val_acc = get_train_val_acc(logs) 161 | 162 | mlflow.log_params({ 163 | 'model': args.experiment_name, 164 | 'run': args.run_name, 165 | 'train acc': f'{train_acc}', 166 | 'val acc': f'{val_acc}', 167 | 'test acc': 'NA' 168 | }) 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /src/schedule_stuff.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import subprocess 4 | 5 | import schedule 6 | from rich.console import Console 7 | 8 | CONSOLE = Console() 9 | 10 | # https://schedule.readthedocs.io/en/stable/examples.html 11 | 12 | 13 | def pose_feasibility(cat, out_dir='mmaction2/data/phar/pose'): 14 | """Schedule for the pose_feasibility.py script.""" 15 | CONSOLE.print(f'Checking pose feasibility for {cat}...', style='green') 16 | script_path = 'tools/analysis/pose_feasibility.py' 17 | 18 | subargs = ['python', script_path, cat, '--out-dir', out_dir, '--resume'] 19 | subprocess.run(subargs) 20 | return schedule.CancelJob 21 | 22 | 23 | def extract_audio(in_dir, out_dir): 24 | """Scheduler to extract audio from videos_val. 
25 | 26 | Args: 27 | in_dir (_type_): _description_ 28 | out_dir (_type_): _description_ 29 | 30 | Returns: 31 | _type_: _description_ 32 | """ 33 | import time 34 | script_dir = '/mmaction2/tools/data/extract_audio.py' 35 | for dIr in os.listdir(in_dir): 36 | CONSOLE.print(f'Extracting videos for {dIr}...', style='green') 37 | CONSOLE.print(osp.join(in_dir, dIr)) 38 | CONSOLE.print(osp.join(out_dir, dIr)) 39 | 40 | subargs = [ 41 | 'python', script_dir, 42 | osp.join(in_dir, dIr), 43 | osp.join(out_dir, dIr), '--level', '1', '--ext', 'avi' 44 | ] 45 | subprocess.run(subargs) 46 | time.sleep(30) 47 | return schedule.CancelJob 48 | 49 | 50 | def extract_audio_feature(in_dir, out_dir): 51 | """Extract spectogram features from audio. 52 | 53 | Args: 54 | in_dir (_type_): _description_ 55 | out_dir (_type_): _description_ 56 | 57 | Returns: 58 | _type_: _description_ 59 | """ 60 | script_dir = '/mmaction2/tools/data/build_audio_features.py' 61 | for dIr in os.listdir(in_dir): 62 | dir_path = osp.join(in_dir, dIr) 63 | for audio in os.listdir(dir_path): 64 | audio_path = osp.join(dir_path, audio) 65 | subargs = [ 66 | 'python', script_dir, audio_path, 67 | osp.join(out_dir, 68 | audio.split('.')[0] + '.npy'), '--level', '1', 69 | '--ext', 'avi' 70 | ] 71 | subprocess.run(subargs) 72 | 73 | return schedule.CancelJob 74 | 75 | 76 | def train_model(config: str, 77 | work_dir: str, 78 | resume_from=None, 79 | cfg_options=None): 80 | script_path = 'mmaction2/tools/dist_train.sh' 81 | no_gpus = 1 82 | subargs = [ 83 | 'bash', script_path, config, 84 | str(no_gpus), '--work-dir', work_dir, '--validate' 85 | ] 86 | if resume_from: 87 | subargs.append('--resume-from') 88 | subargs.append(resume_from) 89 | if cfg_options: 90 | subargs.append('--cfg-options') 91 | for tup in cfg_options.items(): 92 | subargs.append(f'{tup[0]}={tup[1]}') 93 | subprocess.run(subargs) 94 | 95 | 96 | def demo(in_video, out_video): 97 | script_path = 'src/demo/multimodial_demo.py' 98 | subargs = ['python', script_path, in_video, out_video] 99 | subprocess.run(subargs) 100 | 101 | 102 | schedule.every().friday.at('02:30').do( 103 | train_model, 104 | config=('configs/timesformer/' 105 | 'timesformer_divST_8x32x1_15e_kinetics400_rgb.py'), 106 | work_dir='mmaction2/work_dir/timesformer/') 107 | 108 | while True: 109 | schedule.run_pending() 110 | -------------------------------------------------------------------------------- /src/top_tags.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from rich.console import Console 5 | 6 | CONSOLE = Console() 7 | 8 | # top predictions to check for each clip 9 | N = 2 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='get the top tags of a video') 14 | parser.add_argument('predictions', help='json file containing predictions') 15 | parser.add_argument('--topk', 16 | type=int, 17 | default=3, 18 | choices=[1, 2, 3, 4, 5], 19 | help='top k tags to calculate') 20 | parser.add_argument('--label-map', 21 | default='resources/annotations/annotations.txt', 22 | help='annotation file') 23 | args = parser.parse_args() 24 | return args 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | with open(args.label_map, 'r') as ann: 30 | result = {line.strip(): 0 for line in ann} 31 | 32 | assert args.predictions.endswith( 33 | '.json'), 'prediction file is only supported in json format' 34 | with open(args.predictions, 'r') as f: 35 | predictions = json.load(f) 36 | 37 | for pred in 
predictions:
38 |         top_pred = list(pred.items())[:N]
39 |         for p in top_pred:
40 |             result[p[1]] += 1
41 | 
42 |     result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
43 |     CONSOLE.print(f'Top {args.topk} tags: {list(result.items())[:args.topk]}')
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     main()
48 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 | 
4 | 
5 | def annotations_list(annotations):
6 |     """Given an annotation file, return a list of them."""
7 |     with open(annotations) as ann:
8 |         result = [line.strip().replace('-', '_') for line in ann]
9 |     return result
10 | 
11 | 
12 | def annotations_dic(annotations):
13 |     """Given an annotation file, return a dictionary {label: index} of them."""
14 |     labels = annotations_list(annotations)
15 |     return {label: i for i, label in enumerate(labels)}
16 | 
17 | 
18 | def annotations_dict_rev(annotations):
19 |     """Given an annotation file return a dictionary {index: label} of them."""
20 |     result = annotations_dic(annotations)
21 |     return {v: k for k, v in result.items()}
22 | 
23 | 
24 | def gen_id(size=8):
25 |     """Generate a random id."""
26 |     chars = string.ascii_uppercase + string.digits
27 |     return ''.join(random.choice(chars) for _ in range(size))
28 | 
29 | 
30 | def prettify(byte_content):
31 |     """Prettify subprocess output.
32 | 
33 |     Args:
34 |         byte_content (bytes): output captured from a subprocess call
35 | 
36 |     Returns:
37 |         str: decoded text with escaped newlines and tabs expanded
38 |     """
39 |     decoded = byte_content.decode('utf-8')
40 |     formatted_output = decoded.replace('\\n', '\n').replace('\\t', '\t')
41 |     return formatted_output
42 | 
--------------------------------------------------------------------------------
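
A minimal usage sketch for the helpers in src/utils.py, assuming the default annotation file at resources/annotations/annotations.txt; the label names in the comments and the printed clip name are illustrative, not taken from the real label map:

import sys

sys.path.append('src/')  # same import pattern the demo scripts use
import utils

ann = 'resources/annotations/annotations.txt'
labels = utils.annotations_list(ann)           # e.g. ['label_a', 'label_b', ...]
label_to_id = utils.annotations_dic(ann)       # e.g. {'label_a': 0, 'label_b': 1}
id_to_label = utils.annotations_dict_rev(ann)  # e.g. {0: 'label_a', 1: 'label_b'}

# gen_id() returns an 8-character uppercase/digit id; the long-video demo above
# names its temporary subclips '<timestamp>_<id>.mp4' with it.
print(f'ts00_{utils.gen_id()}.mp4')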
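
And a sketch of the tallying performed by src/top_tags.py, assuming the predictions JSON is a list with one dict per clip whose values are label names ordered best-first (the keys are ignored, mirroring result[p[1]] += 1 above); the helper name tally_tags and the paths in the usage comment are illustrative only:

import json
from collections import Counter

N = 2  # top predictions considered per clip, as in src/top_tags.py


def tally_tags(predictions_path, label_map_path, topk=3):
    """Count how often each label appears among the top-N predictions per clip."""
    with open(label_map_path) as ann:
        counts = Counter({line.strip(): 0 for line in ann})
    with open(predictions_path) as f:
        predictions = json.load(f)
    for pred in predictions:
        # keep only the first N (best) entries of each per-clip dict
        for _, label in list(pred.items())[:N]:
            counts[label] += 1
    return counts.most_common(topk)


# e.g. tally_tags('demos/predictions.json', 'resources/annotations/annotations.txt')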