├── .circleci └── config.yml ├── .flake8 ├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bugs.md │ ├── config.yml │ ├── feature_request.md │ └── questions-help.md ├── PULL_REQUEST_TEMPLATE.md └── media │ ├── ava_slowfast.gif │ └── logo_horizontal_color.png ├── .gitignore ├── .readthedocs.yml ├── CONTRIBUTING.md ├── INSTALL.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── dev ├── README.md └── linter.sh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements.txt └── source │ ├── accelerator.md │ ├── api │ ├── data │ │ ├── data.rst │ │ └── index.rst │ ├── index.rst │ ├── layers │ │ ├── index.rst │ │ └── layers.rst │ ├── models │ │ ├── byol.rst │ │ ├── csn.rst │ │ ├── head.rst │ │ ├── index.rst │ │ ├── masked_multistream.rst │ │ ├── memory_bank.rst │ │ ├── net.rst │ │ ├── r2plus1d.rst │ │ ├── resnet.rst │ │ ├── simclr.rst │ │ ├── slowfast.rst │ │ ├── stem.rst │ │ └── x3d.rst │ └── transforms │ │ ├── index.rst │ │ └── transforms.rst │ ├── conf.py │ ├── data.md │ ├── data_preparation.md │ ├── index.rst │ ├── layers.md │ ├── model_zoo.md │ ├── models.md │ └── transforms.md ├── hubconf.py ├── projects └── video_nerf │ ├── README.md │ ├── dataset.py │ ├── dataset_utils.py │ ├── download_objectron_data.py │ └── objectron.yaml ├── pytorchvideo ├── __init__.py ├── accelerator │ ├── __init__.py │ ├── deployment │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ └── model_transmuter.py │ │ └── mobile_cpu │ │ │ ├── __init__.py │ │ │ ├── transmuter │ │ │ ├── __init__.py │ │ │ └── transmuter_mobile_cpu.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── model_conversion.py │ └── efficient_blocks │ │ ├── __init__.py │ │ ├── efficient_block_base.py │ │ └── no_op_convert_block.py ├── data │ ├── __init__.py │ ├── ava.py │ ├── charades.py │ ├── clip_sampling.py │ ├── dataset_manifest_utils.py │ ├── decoder.py │ ├── domsev.py │ ├── ego4d │ │ ├── __init__.py │ │ ├── ego4d_dataset.py │ │ └── utils.py │ ├── encoded_video.py │ ├── encoded_video_decord.py │ ├── encoded_video_pyav.py │ ├── encoded_video_torchvision.py │ ├── epic_kitchen │ │ ├── __init__.py │ │ ├── epic_kitchen_dataset.py │ │ └── utils.py │ ├── epic_kitchen_forecasting.py │ ├── epic_kitchen_recognition.py │ ├── frame_video.py │ ├── hmdb51.py │ ├── json_dataset.py │ ├── kinetics.py │ ├── labeled_video_dataset.py │ ├── labeled_video_paths.py │ ├── ssv2.py │ ├── ucf101.py │ ├── utils.py │ └── video.py ├── layers │ ├── __init__.py │ ├── accelerator │ │ ├── __init__.py │ │ └── mobile_cpu │ │ │ ├── __init__.py │ │ │ ├── activation_functions.py │ │ │ ├── attention.py │ │ │ ├── conv_helper.py │ │ │ ├── convolutions.py │ │ │ ├── fully_connected.py │ │ │ └── pool.py │ ├── attention.py │ ├── attention_torchscript.py │ ├── batch_norm.py │ ├── convolutions.py │ ├── distributed.py │ ├── drop_path.py │ ├── fusion.py │ ├── mlp.py │ ├── nonlocal_net.py │ ├── positional_encoding.py │ ├── positional_encoding_torchscript.py │ ├── squeeze_excitation.py │ ├── swish.py │ └── utils.py ├── losses │ ├── __init__.py │ └── soft_target_cross_entropy.py ├── models │ ├── __init__.py │ ├── accelerator │ │ ├── __init__.py │ │ └── mobile_cpu │ │ │ ├── __init__.py │ │ │ ├── efficient_x3d.py │ │ │ └── residual_blocks.py │ ├── audio_visual_slowfast.py │ ├── byol.py │ ├── csn.py │ ├── head.py │ ├── hub │ │ ├── README.md │ │ ├── __init__.py │ │ ├── csn.py │ │ ├── efficient_x3d_mobile_cpu.py │ │ ├── r2plus1d.py │ │ ├── resnet.py │ │ ├── slowfast.py │ │ ├── utils.py │ │ ├── vision_transformers.py │ │ └── x3d.py │ ├── masked_multistream.py 
│ ├── memory_bank.py │ ├── net.py │ ├── r2plus1d.py │ ├── resnet.py │ ├── simclr.py │ ├── slowfast.py │ ├── stem.py │ ├── vision_transformers.py │ ├── weight_init.py │ └── x3d.py ├── neural_engine │ ├── detection_hook.py │ ├── engine.py │ └── hook.py └── transforms │ ├── __init__.py │ ├── augmentations.py │ ├── augmix.py │ ├── functional.py │ ├── mix.py │ ├── rand_augment.py │ ├── transforms.py │ └── transforms_factory.py ├── pytorchvideo_trainer ├── README.md ├── pytorchvideo_trainer │ ├── __init__.py │ ├── callbacks │ │ ├── __init__.py │ │ └── precise_batchnorm.py │ ├── conf │ │ ├── __init__.py │ │ ├── byol_train_app_conf.yaml │ │ ├── callbacks │ │ │ └── precise_bn.yaml │ │ ├── classification_mvit_16x4.yaml │ │ ├── classification_slow_8x8_r50.yaml │ │ ├── classification_slowfast_8x8_r50.yaml │ │ ├── classification_x3d_xs.yaml │ │ ├── datamodule │ │ │ ├── dataloader │ │ │ │ ├── kinetics_classification.yaml │ │ │ │ └── kinetics_contrastive.yaml │ │ │ └── transforms │ │ │ │ ├── kinetics_classification_mvit_16x4.yaml │ │ │ │ ├── kinetics_classification_slow.yaml │ │ │ │ ├── kinetics_classification_slowfast.yaml │ │ │ │ ├── kinetics_classification_x3d_xs.yaml │ │ │ │ ├── kinetics_contrastive.yaml │ │ │ │ └── kinetics_moco_v2.yaml │ │ ├── logger │ │ │ └── ptl.yaml │ │ ├── moco_v2_train_app_conf.yaml │ │ ├── module │ │ │ ├── knn_memory │ │ │ │ └── kinetics_k400.yaml │ │ │ ├── loss │ │ │ │ ├── contrastive.yaml │ │ │ │ ├── cross_entropy.yaml │ │ │ │ ├── nt_xent.yaml │ │ │ │ ├── similarity.yaml │ │ │ │ └── soft_cross_entropy.yaml │ │ │ ├── lr_scheduler │ │ │ │ └── cosine_with_warmup.yaml │ │ │ ├── metrics │ │ │ │ ├── accuracy.yaml │ │ │ │ └── average_precision.yaml │ │ │ ├── model │ │ │ │ ├── from_lightning_checkpoint.yaml │ │ │ │ ├── from_model_zoo_checkpoint.yaml │ │ │ │ ├── from_ssl_checkpoint.yaml │ │ │ │ ├── mvit_base_16x4.yaml │ │ │ │ ├── slow_r50.yaml │ │ │ │ ├── slow_r50_byol.yaml │ │ │ │ ├── slow_r50_moco_v2.yaml │ │ │ │ ├── slow_r50_simclr.yaml │ │ │ │ ├── slowfast_r50.yaml │ │ │ │ └── x3d_xs.yaml │ │ │ └── optim │ │ │ │ ├── adam.yaml │ │ │ │ ├── adamw.yaml │ │ │ │ ├── sgd.yaml │ │ │ │ └── sgd_ssl.yaml │ │ ├── simclr_train_app_conf.yaml │ │ ├── submitit_conf │ │ │ └── fair_cluster.yaml │ │ └── trainer │ │ │ ├── cpu.yaml │ │ │ ├── multi_gpu.yaml │ │ │ └── single_gpu.yaml │ ├── datamodule │ │ ├── __init__.py │ │ ├── collators.py │ │ ├── datamodule.py │ │ ├── rand_erase_transform.py │ │ └── transforms.py │ ├── module │ │ ├── __init__.py │ │ ├── byol.py │ │ ├── distributed_utils.py │ │ ├── losses.py │ │ ├── lr_policy.py │ │ ├── moco_v2.py │ │ ├── optimizer.py │ │ ├── simclr.py │ │ ├── ssl_helper.py │ │ └── video_classification.py │ └── train_app.py ├── setup.py └── tests │ ├── __init__.py │ ├── test_conf_datamodule.py │ ├── test_conf_module.py │ ├── test_task_byol.py │ ├── test_task_moco_v2.py │ ├── test_task_module_all.py │ ├── test_task_simclr.py │ ├── test_task_video_classification.py │ └── util.py ├── setup.cfg ├── setup.py ├── tests ├── README.md ├── __init__.py ├── benchmark_accelerator_efficient_blocks.py ├── benchmark_transforms.py ├── test_accelerator_deployment_mobile_cpu_model_conversion.py ├── test_accelerator_deployment_model_transmuter.py ├── test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py ├── test_accelerator_efficient_blocks_mobile_cpu_conv3d.py ├── test_accelerator_efficient_blocks_mobile_cpu_head_layer.py ├── test_accelerator_efficient_blocks_mobile_cpu_residual_block.py ├── test_accelerator_models_efficient_x3d.py ├── test_data_ava_dataset.py ├── 
test_data_charades_dataset.py ├── test_data_dataset_manifest_utils.py ├── test_data_domsev_dataset.py ├── test_data_encoded_video.py ├── test_data_epic_kitchen_dataset.py ├── test_data_epic_kitchen_forecasting.py ├── test_data_epic_kitchen_recognition.py ├── test_data_epic_kitchen_utils.py ├── test_data_frame_video.py ├── test_data_json_dataset.py ├── test_data_labeled_video_dataset.py ├── test_data_ssv2_dataset.py ├── test_data_utils.py ├── test_fuse_bn.py ├── test_layers_attention.py ├── test_layers_convolutions.py ├── test_layers_drop_path.py ├── test_layers_fusion.py ├── test_layers_mlp.py ├── test_layers_nonlocal_net.py ├── test_layers_positional_encoding.py ├── test_layers_squeeze_excitation.py ├── test_losses_soft_target_cross_entropy.py ├── test_models_audio_visual_slowfast.py ├── test_models_byol.py ├── test_models_csn.py ├── test_models_head.py ├── test_models_hub_vision_transformers.py ├── test_models_masked_multistream.py ├── test_models_memory_bank.py ├── test_models_r2plus1d.py ├── test_models_resnet.py ├── test_models_slowfast.py ├── test_models_stem.py ├── test_models_vision_transformers.py ├── test_models_x3d.py ├── test_simclr.py ├── test_transforms.py ├── test_uniform_clip_sampler.py └── utils.py ├── tutorials ├── accelerator │ ├── Build_your_model_with_PytorchVideo_Accelerator.ipynb │ ├── Use_Model_Transmuter.ipynb │ └── Use_PytorchVideo_Accelerator_Model_Zoo.ipynb ├── torchhub_inference_tutorial.ipynb ├── video_classification_example │ ├── environment.yml │ ├── slurm.py │ └── train.py └── video_detection_example │ ├── video_detection_inference_tutorial.ipynb │ └── visualization.py └── website ├── .dockerignore ├── .gitignore ├── docs ├── tutorial_accelerator_build_your_model.md ├── tutorial_accelerator_use_accelerator_model_zoo.md ├── tutorial_accelerator_use_model_transmuter.md ├── tutorial_classification.md ├── tutorial_overview.md ├── tutorial_torchhub_detection_inference.md └── tutorial_torchhub_inference.md └── website ├── README.md ├── core └── Footer.js ├── package.json ├── pages └── en │ └── index.js ├── sidebars.json ├── siteConfig.js └── static ├── CNAME ├── css └── custom.css └── img ├── efficient.svg ├── favicon.png ├── logo.svg ├── logo_no_text.svg ├── logo_white.svg ├── modelzoo.svg ├── oss_logo.png ├── pytorch.svg └── reproducible.svg /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, E221 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | exclude = build,__init__.py 7 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PyTorchVIdeo 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 
7 | 8 | However, if you're adding any significant features, please make sure to have a corresponding issue to outline your proposal and motivation and allow time for us to give feedback, *before* you send a PR. 9 | We do not always accept new features, and we take the following factors into consideration: 10 | 11 | - Whether the same feature can be achieved without modifying PyTorchVideo directly. If any aspect of the API is not extensible, please highlight this in an issue so we can work on making this more extensible. 12 | - Whether the feature is potentially useful to a large audience, or only to a small portion of users. 13 | - Whether the proposed solution has a good design and interface. 14 | - Whether the proposed solution adds extra mental/practical overhead to users who don't need such feature. 15 | - Whether the proposed solution breaks existing APIs. 16 | 17 | When sending a PR, please ensure you complete the following steps: 18 | 19 | 1. Fork the repo and create your branch from `main`. Follow the instructions 20 | in [INSTALL.md](../INSTALL.md) to build the repo. 21 | 2. If you've added code that should be tested, add tests. 22 | 3. If you've changed any APIs, please update the documentation. 23 | 4. Ensure the test suite passes: 24 | ``` 25 | cd pytorchvideo/tests 26 | python -m unittest -v 27 | ``` 28 | 5. Make sure your code lints by running `dev/linter.sh` from the project root. 29 | 6. If a PR contains multiple orthogonal changes, split it into multiple separate PRs. 30 | 7. If you haven't already, complete the Contributor License Agreement ("CLA"). 31 | 32 | ## Contributor License Agreement ("CLA") 33 | In order to accept your pull request, we need you to submit a CLA. You only need 34 | to do this once to work on any of Facebook's open source projects. 35 | 36 | Complete your CLA here: 37 | 38 | ## Issues 39 | We use GitHub issues to track public bugs. Please ensure your description is 40 | clear and has sufficient instructions to be able to reproduce the issue. 41 | 42 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 43 | disclosure of security bugs. In those cases, please go through the process 44 | outlined on that page and do not file a public issue. 45 | 46 | ## Coding Style 47 | We follow these [python](http://google.github.io/styleguide/pyguide.html) and [C++](https://google.github.io/styleguide/cppguide.html) style guides. 48 | 49 | For the linter to work, you will need to install `black`, `flake`, `isort` and `clang-format`, and 50 | they need to be fairly up to date. 51 | 52 | ## License 53 | By contributing to PyTorchVideo, you agree that your contributions will be licensed 54 | under the LICENSE file in the root directory of this source tree. 55 | 56 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "🐛 Bugs / Unexpected behaviors" 3 | about: Please report unexpected behaviors or bugs in PyTorchVideo. 4 | 5 | --- 6 | 7 | If you do not know the root cause of the problem / bug, and wish someone to help you, please 8 | post according to this template: 9 | 10 | ## 🐛 Bugs / Unexpected behaviors 11 | 12 | 13 | NOTE: Please look at the existing list of Issues tagged with the label ['bug`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Abug). **Only open a new issue if this bug has not already been reported. 
If an issue already exists, please comment there instead.**. 14 | 15 | ## Instructions To Reproduce the Issue: 16 | 17 | Please include the following (depending on what the issue is): 18 | 19 | 1. Any changes you made (`git diff`) or code you wrote 20 | ``` 21 | 22 | ``` 23 | 2. The exact command(s) you ran: 24 | 3. What you observed (including the full logs): 25 | ``` 26 | 27 | ``` 28 | 29 | Please also simplify the steps as much as possible so they do not require additional resources to 30 | run, such as a private dataset, models, etc. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature Request" 3 | about: Submit a proposal/request for a new PyTorchVideo feature 4 | 5 | --- 6 | 7 | ## 🚀 Feature 8 | 9 | 10 | NOTE: Please look at the existing list of Issues tagged with the label ['enhancement`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Aenhancement). **Only open a new issue if you do not see your feature request there**. 11 | 12 | ## Motivation 13 | 14 | 16 | 17 | ## Pitch 18 | 19 | 20 | 21 | NOTE: we only consider adding new features if they are useful for many users. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/questions-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions" 3 | about: How do I do X with PyTorchVideo? How does PyTorchVideo do X? 4 | 5 | --- 6 | 7 | ## ❓ Questions on how to use PyTorchVideo 8 | 9 | 10 | 11 | 12 | NOTE: Please look at the existing list of Issues tagged with the label ['question`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3Aquestion) or ['how-to`](https://github.com/facebookresearch/pytorchvideo/issues?q=label%3A%22how+to%22). **Only open a new issue if you cannot find an answer there**. 13 | 14 | Also note the following: 15 | 16 | 1. If you encountered any errors or unexpected issues while using PyTorchVideo and need help resolving them, 17 | please use the "Bugs / Unexpected behaviors" issue template. 18 | 19 | 2. We do not answer general machine learning / computer vision questions that are not specific to 20 | PyTorchVideo, such as how a model works or what algorithm/methods can be 21 | used to achieve X. 22 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Motivation and Context 2 | 3 | 4 | 5 | 6 | 7 | ## How Has This Been Tested 8 | 9 | 10 | 11 | ## Types of changes 12 | 13 | 14 | - [ ] Docs change / refactoring / dependency upgrade 15 | - [ ] Bug fix (non-breaking change which fixes an issue) 16 | - [ ] New feature (non-breaking change which adds functionality) 17 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 18 | 19 | ## Checklist 20 | 21 | 22 | 23 | - [ ] My code follows the code style of this project. 24 | - [ ] My change requires a change to the documentation. 25 | - [ ] I have updated the documentation accordingly. 
26 | - [ ] I have read the **CONTRIBUTING** document. 27 | - [ ] I have completed my CLA (see **CONTRIBUTING**) 28 | - [ ] I have added tests to cover my changes. 29 | - [ ] All new and existing tests passed. 30 | 31 | -------------------------------------------------------------------------------- /.github/media/ava_slowfast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/pytorchvideo/6cdc929315aab1b5674b6dcf73b16ec99147735f/.github/media/ava_slowfast.gif -------------------------------------------------------------------------------- /.github/media/logo_horizontal_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/pytorchvideo/6cdc929315aab1b5674b6dcf73b16ec99147735f/.github/media/logo_horizontal_color.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | 3 | build/ 4 | _ext 5 | *.pyc 6 | *.pyd 7 | *.so 8 | *.dll 9 | *.egg-info/ 10 | **/__pycache__/ 11 | *-checkpoint.ipynb 12 | **/.ipynb_checkpoints 13 | **/.ipynb_checkpoints/** 14 | 15 | 16 | # Docusaurus site 17 | website/yarn.lock 18 | website/build/ 19 | website/i18n/ 20 | website/node_modules/* 21 | website/npm-debug.log 22 | 23 | ## Generated for tutorials 24 | website/_tutorials/ 25 | website/static/files/ 26 | website/pages/tutorials/* 27 | !website/pages/tutorials/index.js 28 | 29 | 30 | ## Conda and pip builds 31 | packaging/out/ 32 | packaging/output_files/ 33 | dist/ 34 | wheels/ 35 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | builder: html 11 | configuration: docs/source/conf.py 12 | 13 | # Build documentation with MkDocs 14 | #mkdocs: 15 | # configuration: mkdocs.yml 16 | 17 | # Optionally build your docs in additional formats such as PDF and ePub 18 | formats: all 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.7 23 | system_packages: true 24 | install: 25 | - requirements: docs/requirements.txt 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to fvcore 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Testing 16 | 17 | Please follow the instructions mentioned in [test-README](https://github.com/facebookresearch/pytorchvideo/blob/main/tests/README.md) to run the existing and your newly added tests. 
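For quick reference, the full suite can usually be run with Python's built-in unittest runner from the repository root (the test README linked above has the authoritative, up-to-date commands):

```
cd tests
python -m unittest -v

# Or run a single test module, e.g. the ResNet model tests
python -m unittest test_models_resnet -v
```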
18 | 19 | ## Linting 20 | 21 | We provide a linting script to correctly format your code changes. 22 | Please follow the instructions mentioned in [dev-README](https://github.com/facebookresearch/pytorchvideo/blob/main/dev/README.md) to run the linter. 23 | 24 | 25 | ## Contributor License Agreement ("CLA") 26 | In order to accept your pull request, we need you to submit a CLA. You only need 27 | to do this once to work on any of Facebook's open source projects. 28 | 29 | Complete your CLA here: 30 | 31 | ## Issues 32 | We use GitHub issues to track public bugs. Please ensure your description is 33 | clear and has sufficient instructions to be able to reproduce the issue. 34 | 35 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 36 | disclosure of security bugs. In those cases, please go through the process 37 | outlined on that page and do not file a public issue. 38 | 39 | ## License 40 | By contributing to fvcore, you agree that your contributions will be licensed 41 | under the LICENSE file in the root directory of this source tree. 42 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Installing PyTorchVideo 4 | 5 | 6 | ### 1. Install from PyPI 7 | For stable release, 8 | ``` 9 | pip install pytorchvideo 15 | ``` 16 | 17 | For nightly builds, 18 | ``` 19 | pip install pytorchvideo-nightly 20 | ``` 21 | 22 | ### 2. Install from GitHub using pip 23 | ``` 24 | pip install "git+https://github.com/facebookresearch/pytorchvideo.git" 25 | ``` 26 | To install from the code of the latest released version instead of the main branch, use the following: 27 | ``` 28 | pip install "git+https://github.com/facebookresearch/pytorchvideo.git@stable" 29 | ``` 30 | 31 | ### 3. Install from a local clone 32 | ``` 33 | git clone https://github.com/facebookresearch/pytorchvideo.git 34 | cd pytorchvideo 35 | pip install -e . 36 | 37 | # For developing and testing 38 | pip install -e ".[test,dev]" 39 | ``` 40 | 41 | 42 | ## Requirements 43 | 44 | ### Core library 45 | 46 | - Python 3.7 or 3.8 47 | - PyTorch 1.8.0 or higher. 48 | - torchvision that matches the PyTorch installation. You can install them together as explained at pytorch.org to make sure of this. 49 | - [fvcore](https://github.com/facebookresearch/fvcore) version 0.1.4 or higher 50 | - [ioPath](https://github.com/facebookresearch/iopath) 51 | - If CUDA is to be used, use a CUDA version that is supported by the corresponding PyTorch version (10.2 or higher). 52 | 53 | We recommend setting up a conda environment with PyTorch and torchvision before installing PyTorchVideo. 54 | For instance, follow the instructions below to set up the conda environment: 55 | ``` 56 | conda create -n pytorchvideo python=3.7 57 | conda activate pytorchvideo 58 | conda install -c pytorch pytorch=1.8.0 torchvision cudatoolkit=10.2 59 | ``` 60 | 61 | ## Testing 62 | 63 | Please follow the instructions mentioned in [test-README](https://github.com/facebookresearch/pytorchvideo/blob/main/tests/README.md) to run the provided tests. 64 | 65 | ## Linting 66 | 67 | We also provide a linting script to correctly format your code edits.
68 | Please follow the instructions mentioned in [dev-README](https://github.com/facebookresearch/pytorchvideo/blob/main/dev/README.md) to run the linter. 69 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include CONTRIBUTING.md 3 | include requirements.txt -------------------------------------------------------------------------------- /dev/README.md: -------------------------------------------------------------------------------- 1 | ## Running Linter 2 | 3 | 4 | Before running the linter, please ensure that you installed the necessary additional linter dependencies. 5 | If not installed, check the [install-README](https://github.com/facebookresearch/pytorchvideo/blob/main/INSTALL.md) on how to do it. 6 | 7 | Post that, you can run the linter from the project root using, 8 | 9 | ``` 10 | ./dev/linter.sh 11 | ``` 12 | -------------------------------------------------------------------------------- /dev/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ev 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | # Run this script at project root with "./dev/linter.sh" before you commit. 5 | 6 | echo "Running autoflake..." 7 | python -m autoflake --remove-all-unused-imports -i . 8 | 9 | echo "Running isort..." 10 | isort -y -sp . 11 | 12 | echo "Running black..." 13 | black . 14 | 15 | echo "Running flake8..." 16 | if [ -x "$(command -v flake8)" ]; then 17 | flake8 . 18 | else 19 | python3 -m flake8 . 20 | fi 21 | 22 | command -v arc > /dev/null && { 23 | echo "Running arc lint ..." 24 | arc lint 25 | } 26 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Setup 3 | 4 | ### Install dependencies 5 | 6 | ``` 7 | pip install -U recommonmark mock sphinx sphinx_rtd_theme sphinx_markdown_tables 8 | ``` 9 | 10 | ### Add symlink to the root README.md 11 | 12 | We want to include the root readme as an overview. Before generating the docs create a symlink to the root readme. 13 | 14 | ``` 15 | cd /docs 16 | ln -s ../README.md overview.md 17 | ``` 18 | 19 | In `conf.py` for deployment this is done using `subprocess.call`. 20 | 21 | ### Add a new file 22 | 23 | Add a new `.md` or `.rst` file and add the name to the doc tree in `index.rst` e.g 24 | 25 | ``` 26 | .. 
toctree:: 27 | :maxdepth: 1 28 | :caption: Intro Documentation 29 | 30 | overview 31 | ``` 32 | 33 | ### Build 34 | 35 | From `pytorchvideo/docs` run: 36 | 37 | ``` 38 | > make html 39 | ``` 40 | 41 | The website is generated in `build/html`. 42 | 43 | ### Common Issues 44 | 45 | Sphinx can be fussy, and sometimes about things you weren’t expecting. For example, you might encounter something like: 46 | 47 | WARNING: toctree contains reference to nonexisting document u'overview' 48 | ... 49 | checking consistency... 50 | /docs/overview.rst:: 51 | WARNING: document isn't included in any toctree 52 | 53 | You might have indented overview in the .. toctree:: in index.rst with four spaces, when Sphinx is expecting three. 54 | 55 | 56 | ### View 57 | 58 | Start a python simple server: 59 | 60 | ``` 61 | > python -m http.server 62 | ``` 63 | 64 | Navigate to: `http://0.0.0.0:8000/` 65 | 66 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | docutils==0.16 2 | # https://github.com/sphinx-doc/sphinx/commit/7acd3ada3f38076af7b2b5c9f3b60bb9c2587a3d 3 | sphinx==3.2.0 4 | recommonmark==0.6.0 5 | sphinx_markdown_tables 6 | mock 7 | numpy 8 | av 9 | torch 10 | torchvision 11 | opencv-python 12 | parameterized 13 | git+git://github.com/facebookresearch/fvcore.git 14 | git+git://github.com/facebookresearch/iopath.git 15 | git+git://github.com/kalyanvasudev/pytorch_sphinx_theme.git 16 | -------------------------------------------------------------------------------- /docs/source/api/data/data.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.data 2 | ================= 3 | 4 | .. automodule:: pytorchvideo.data 5 | :imported-members: 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/api/data/index.rst: -------------------------------------------------------------------------------- 1 | Data API 2 | ================== 3 | 4 | .. toctree:: 5 | 6 | data 7 | 8 | -------------------------------------------------------------------------------- /docs/source/api/index.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================== 3 | 4 | .. 
toctree:: 5 | 6 | models/index 7 | data/index 8 | layers/index 9 | transforms/index -------------------------------------------------------------------------------- /docs/source/api/layers/index.rst: -------------------------------------------------------------------------------- 1 | Layers API 2 | ================== 3 | 4 | .. toctree:: 5 | 6 | layers -------------------------------------------------------------------------------- /docs/source/api/layers/layers.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.layers.batch_norm 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.layers.batch_norm 6 | :members: 7 | 8 | 9 | pytorchvideo.layers.convolutions 10 | ================================= 11 | 12 | 13 | .. automodule:: pytorchvideo.layers.convolutions 14 | :members: 15 | 16 | pytorchvideo.layers.fusion 17 | ================================= 18 | 19 | 20 | .. automodule:: pytorchvideo.layers.fusion 21 | :members: 22 | 23 | pytorchvideo.layers.mlp 24 | ================================= 25 | 26 | 27 | .. automodule:: pytorchvideo.layers.mlp 28 | :members: 29 | 30 | pytorchvideo.layers.nonlocal_net 31 | ================================= 32 | 33 | 34 | .. automodule:: pytorchvideo.layers.nonlocal_net 35 | :members: 36 | 37 | pytorchvideo.layers.positional_encoding 38 | ================================= 39 | 40 | 41 | .. automodule:: pytorchvideo.layers.positional_encoding 42 | :members: 43 | 44 | pytorchvideo.layers.swish 45 | ================================= 46 | 47 | 48 | .. automodule:: pytorchvideo.layers.swish 49 | :members: 50 | 51 | pytorchvideo.layers.squeeze_excitation 52 | ================================= 53 | 54 | 55 | .. automodule:: pytorchvideo.layers.squeeze_excitation 56 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/byol.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.byol 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.byol 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/csn.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.csn 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.csn 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/head.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.head 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.head 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/index.rst: -------------------------------------------------------------------------------- 1 | Models API 2 | ================== 3 | 4 | .. toctree:: 5 | 6 | resnet 7 | net 8 | head 9 | stem 10 | csn 11 | x3d 12 | slowfast 13 | r2plus1d 14 | simclr 15 | byol 16 | memory_bank 17 | masked_multistream -------------------------------------------------------------------------------- /docs/source/api/models/masked_multistream.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.masked_multistream 2 | ================================= 3 | 4 | 5 | .. 
automodule:: pytorchvideo.models.masked_multistream 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/memory_bank.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.memory_bank 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.memory_bank 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/net.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.net 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.net 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/r2plus1d.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.r2plus1d 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.r2plus1d 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/resnet.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.resnet 2 | ================================= 3 | 4 | Building blocks for Resnet and resnet-like models 5 | 6 | .. automodule:: pytorchvideo.models.resnet 7 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/simclr.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.simclr 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.simclr 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/slowfast.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.slowfast 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.slowfast 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/stem.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.stem 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.stem 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/models/x3d.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.models.x3d 2 | ================================= 3 | 4 | 5 | .. automodule:: pytorchvideo.models.x3d 6 | :members: -------------------------------------------------------------------------------- /docs/source/api/transforms/index.rst: -------------------------------------------------------------------------------- 1 | Transforms API 2 | ================== 3 | 4 | .. toctree:: 5 | 6 | transforms -------------------------------------------------------------------------------- /docs/source/api/transforms/transforms.rst: -------------------------------------------------------------------------------- 1 | pytorchvideo.transforms 2 | ================================== 3 | 4 | 5 | .. 
automodule:: pytorchvideo.transforms 6 | :imported-members: 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | 12 | pytorchvideo.transforms.functional 13 | ================================== 14 | 15 | 16 | .. automodule:: pytorchvideo.transforms.functional 17 | :imported-members: 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. pytorchvideo documentation master file, created by 2 | sphinx-quickstart on Tue Feb 23 17:19:36 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :github_url: https://github.com/facebookresearch/pytorchvideo/ 7 | 8 | 9 | PyTorchVideo Documentation 10 | ======================================== 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Models 15 | 16 | models 17 | model_zoo 18 | api/models/index 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Data 23 | 24 | data 25 | data_preparation 26 | api/data/index 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: Transforms 31 | 32 | transforms 33 | api/transforms/index 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | :caption: Layers 38 | 39 | layers 40 | api/layers/index 41 | 42 | .. toctree:: 43 | :maxdepth: 1 44 | :caption: Accelerator 45 | 46 | accelerator 47 | 48 | -------------------------------------------------------------------------------- /docs/source/layers.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | 4 | PyTorchVideo is an open-source video understanding library that provides up-to-date builders for state-of-the-art video understanding backbones, layers, heads, and losses, addressing different tasks including acoustic event detection, action recognition (video classification), action detection (video detection), multimodal understanding (acoustic-visual classification), and self-supervised learning. 5 | 6 | The layers subpackage contains definitions for the following layers and activations: 7 | 8 | 9 | * Layer 10 | * [BatchNorm](https://arxiv.org/abs/1502.03167) 11 | * [2+1 Conv](https://arxiv.org/abs/1711.11248) 12 | * ConCat 13 | * MLP 14 | * [Nonlocal Net](https://arxiv.org/abs/1711.07971) 15 | * Positional Encoding 16 | * [Squeeze and Excitation](https://arxiv.org/abs/1709.01507) 17 | * [Swish](https://arxiv.org/abs/1710.05941) 18 | 19 | ## Build standard models 20 | 21 | PyTorchVideo provides default builders to construct state-of-the-art video understanding layers and activations.
22 | 23 | 24 | ### Layers 25 | 26 | You can construct a layer with random weights by calling its constructor: 27 | 28 | ``` 29 | import pytorchvideo.layers as layers 30 | 31 | nonlocal_layer = layers.create_nonlocal(dim_in=256, dim_inner=128) 32 | swish = layers.Swish() 33 | conv_2plus1d = layers.create_conv_2plus1d(in_channels=256, out_channels=512) 34 | ``` 35 | 36 | You can verify whether you have built the layer successfully by: 37 | 38 | ``` 39 | import torch 40 | import pytorchvideo.layers as layers 41 | nonlocal_layer = layers.create_nonlocal(dim_in=256, dim_inner=128) 42 | B, C, T, H, W = 2, 256, 4, 14, 14 43 | input_tensor = torch.zeros(B, C, T, H, W) 44 | output = nonlocal_layer(input_tensor) 45 | 46 | swish = layers.Swish() 47 | B, C, T, H, W = 2, 256, 4, 14, 14 48 | input_tensor = torch.zeros(B, C, T, H, W) 49 | output = swish(input_tensor) 50 | 51 | conv_2plus1d = layers.create_conv_2plus1d(in_channels=256, out_channels=512) 52 | B, C, T, H, W = 2, 256, 4, 14, 14 53 | input_tensor = torch.zeros(B, C, T, H, W) 54 | output = conv_2plus1d(input_tensor) 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/source/transforms.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | The PyTorchVideo transforms package contains common video algorithms used for preprocessing and/or augmenting video data. The package also contains helper dictionary transforms that are useful for interoperability between PyTorchVideo [dataset's clip outputs](https://pytorchvideo.readthedocs.io/en/latest/data.html) and domain-specific transforms. For example, here is a standard transform pipeline for a video model that could be used with a PyTorchVideo dataset: 4 | 5 | ```python 6 | transform = torchvision.transforms.Compose([ 7 | pytorchvideo.transforms.ApplyTransformToKey( 8 | key="video", 9 | transform=torchvision.transforms.Compose([ 10 | pytorchvideo.transforms.UniformTemporalSubsample(8), 11 | pytorchvideo.transforms.Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)), 12 | pytorchvideo.transforms.RandomShortSideScale(min_size=256, max_size=320), 13 | torchvision.transforms.RandomCrop(244), 14 | torchvision.transforms.RandomHorizontalFlip(p=0.5), 15 | ]) 16 | ) 17 | ]) 18 | dataset = pytorchvideo.data.Kinetics( 19 | data_path="path/to/kinetics_root/train.csv", 20 | clip_sampler=pytorchvideo.data.make_clip_sampler("random", duration=2), 21 | transform=transform 22 | ) 23 | ``` 24 | 25 | Notice how the example also includes transforms from TorchVision? PyTorchVideo uses the same canonical tensor shape as TorchVision for video and TorchAudio for audio. This allows the frameworks to be used together freely. 26 | 27 | ## Transform vs Functional interface 28 | 29 | The example above demonstrates the [```pytorchvideo.transforms```](https://pytorchvideo.readthedocs.io/en/latest/api/transforms/transforms.html) interface. These transforms are [```torch.nn.Module```](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) callable classes that can be strung together in a declarative way. PyTorchVideo also provides a [```pytorchvideo.transforms.functional```](https://pytorchvideo.readthedocs.io/en/latest/api/transforms/transforms.html#pytorchvideo-transforms-functional) interface, which exposes the functions that the transform API uses. These allow more fine-grained control over the transformations and may be more suitable for use outside the dataset preprocessing use case.
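To make the contrast concrete, here is a minimal sketch of the functional interface. It assumes the `uniform_temporal_subsample` and `short_side_scale` functions exported by `pytorchvideo.transforms.functional`, and it uses a dummy clip tensor rather than a real decoded video; exact signatures may differ slightly between releases.

```python
import torch
from pytorchvideo.transforms.functional import (
    short_side_scale,
    uniform_temporal_subsample,
)

# A dummy clip in the canonical (C, T, H, W) layout shared with TorchVision.
video = torch.rand(3, 16, 256, 320)

# Call the functions directly instead of composing nn.Module transforms,
# which makes it easy to mix in custom logic between steps.
video = uniform_temporal_subsample(video, num_samples=8)
video = short_side_scale(video, size=256)
```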
30 | 31 | ## Scriptable transforms 32 | 33 | All non-OpenCV transforms are TorchScriptable, as described in the [TorchVision docs](https://pytorch.org/vision/stable/transforms.html#scriptable-transforms). To script the transforms together, please use [```torch.nn.Sequential```](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html) instead of [```torchvision.transforms.Compose```](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Compose). 34 | -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | dependencies = ["torch"] 4 | from pytorchvideo.models.hub import ( # noqa: F401, E402 5 | c2d_r50, 6 | csn_r101, 7 | efficient_x3d_s, 8 | efficient_x3d_xs, 9 | i3d_r50, 10 | mvit_base_16, 11 | mvit_base_16x4, 12 | mvit_base_32x3, 13 | r2plus1d_r50, 14 | slow_r50, 15 | slow_r50_detection, 16 | slowfast_16x8_r101_50_50, 17 | slowfast_r101, 18 | slowfast_r50, 19 | slowfast_r50_detection, 20 | x3d_l, 21 | x3d_m, 22 | x3d_s, 23 | x3d_xs, 24 | ) 25 | -------------------------------------------------------------------------------- /projects/video_nerf/download_objectron_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import os 4 | 5 | import requests 6 | 7 | 8 | # URLs for downloading the Objectron dataset 9 | public_url = "https://storage.googleapis.com/objectron" 10 | blob_path = public_url + "/v1/index/chair_annotations_train" 11 | video_ids = requests.get(blob_path).text 12 | video_ids = video_ids.split("\n") 13 | 14 | DATA_PATH = "./nerf/data/objectron" 15 | 16 | os.makedirs(DATA_PATH, exist_ok=True) 17 | 18 | # Download a video of a chair. 19 | for i in range(3, 4): 20 | video_filename = public_url + "/videos/" + video_ids[i] + "/video.MOV" 21 | metadata_filename = public_url + "/videos/" + video_ids[i] + "/geometry.pbdata" 22 | annotation_filename = public_url + "/annotations/" + video_ids[i] + ".pbdata" 23 | 24 | # This file contains the bundle adjusted cameras 25 | sfm_filename = public_url + "/videos/" + video_ids[i] + "/sfm_arframe.pbdata" 26 | 27 | # video.content contains the video file. 28 | video = requests.get(video_filename) 29 | metadata = requests.get(metadata_filename) 30 | 31 | # Please refer to the Parse Annotation tutorial to see how to parse the annotation files.
32 | annotation = requests.get(annotation_filename) 33 | 34 | sfm = requests.get(sfm_filename) 35 | 36 | video_path = os.path.join(DATA_PATH, "video.MOV") 37 | print("Writing video to %s" % video_path) 38 | file = open(video_path, "wb") 39 | file.write(video.content) 40 | file.close() 41 | 42 | geometry_path = os.path.join(DATA_PATH, "geometry.pbdata") 43 | print("Writing geometry data to %s" % geometry_path) 44 | file = open(geometry_path, "wb") 45 | file.write(metadata.content) 46 | file.close() 47 | 48 | annotation_path = os.path.join(DATA_PATH, "annotation.pbdata") 49 | print("Writing annotation data to %s" % annotation_path) 50 | file = open(annotation_path, "wb") 51 | file.write(annotation.content) 52 | file.close() 53 | 54 | sfm_arframe_path = os.path.join(DATA_PATH, "sfm_arframe.pbdata") 55 | print("Writing bundle adjusted camera data to %s" % sfm_arframe_path) 56 | file = open(sfm_arframe_path, "wb") 57 | file.write(sfm.content) 58 | file.close() 59 | -------------------------------------------------------------------------------- /projects/video_nerf/objectron.yaml: -------------------------------------------------------------------------------- 1 | seed: 3 2 | resume: True 3 | stats_print_interval: 10 4 | validation_epoch_interval: 5 5 | checkpoint_epoch_interval: 30 6 | checkpoint_path: 'checkpoints/objectron.pth' 7 | data: 8 | dataset_name: 'objectron' 9 | image_size: [1440, 1920] # [height, width] 10 | precache_rays: True 11 | test: 12 | mode: 'evaluation' 13 | trajectory_type: 'circular' 14 | up: [0.0, 1.0, 0.0] 15 | scene_center: [-0.5365, -1.05, 7.6191] 16 | n_frames: 50 17 | fps: 1 18 | trajectory_scale: 0.2 19 | optimizer: 20 | max_epochs: 20000 21 | lr: 0.0005 22 | lr_scheduler_step_size: 5000 23 | lr_scheduler_gamma: 0.1 24 | visualization: 25 | history_size: 10 26 | visdom: True 27 | visdom_server: 'localhost' 28 | visdom_port: 8097 29 | visdom_env: 'objectron' 30 | raysampler: 31 | n_pts_per_ray: 64 32 | n_pts_per_ray_fine: 64 33 | n_rays_per_image: 1024 34 | min_depth: 0.1 35 | max_depth: 100.0 36 | stratified: True 37 | stratified_test: False 38 | chunk_size_test: 6000 39 | implicit_function: 40 | n_harmonic_functions_xyz: 10 41 | n_harmonic_functions_dir: 4 42 | n_hidden_neurons_xyz: 256 43 | n_hidden_neurons_dir: 128 44 | density_noise_std: 0.0 45 | n_layers_xyz: 8 46 | -------------------------------------------------------------------------------- /pytorchvideo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | __version__ = "0.1.5" 4 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/deployment/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/deployment/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/deployment/mobile_cpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/deployment/mobile_cpu/transmuter/__init__.py: -------------------------------------------------------------------------------- 1 | from pytorchvideo.accelerator.deployment.common.model_transmuter import ( 2 | EFFICIENT_BLOCK_TRANSMUTER_REGISTRY, 3 | ) 4 | 5 | from .transmuter_mobile_cpu import EFFICIENT_BLOCK_TRANSMUTER_MOBILE_CPU 6 | 7 | 8 | EFFICIENT_BLOCK_TRANSMUTER_REGISTRY["mobile_cpu"] = ( 9 | EFFICIENT_BLOCK_TRANSMUTER_MOBILE_CPU 10 | ) 11 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/deployment/mobile_cpu/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/efficient_blocks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/efficient_blocks/efficient_block_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from abc import abstractmethod 4 | 5 | import torch.nn as nn 6 | 7 | 8 | class EfficientBlockBase(nn.Module): 9 | """ 10 | PyTorchVideo/accelerator provides a set of efficient blocks 11 | that have optimal efficiency for each target hardware device. 12 | 13 | Each efficient block has two forms: 14 | - original form: this form is for training. When efficient block is instantiated, 15 | it is in this original form. 16 | - deployable form: this form is for deployment. Once the network is ready for 17 | deploy, it can be converted into deployable form for efficient execution 18 | on target hardware. One block is transformed into deployable form by calling 19 | convert() method. By conversion to deployable form, 20 | various optimization (operator fuse, kernel optimization, etc.) are applied. 21 | 22 | EfficientBlockBase is the base class for efficient blocks. 23 | All efficient blocks should inherit this base class 24 | and implement following methods: 25 | - forward(): same as required by nn.Module 26 | - convert(): called to convert block into deployable form 27 | """ 28 | 29 | @abstractmethod 30 | def convert(self): 31 | pass 32 | 33 | @abstractmethod 34 | def forward(self): 35 | pass 36 | -------------------------------------------------------------------------------- /pytorchvideo/accelerator/efficient_blocks/no_op_convert_block.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | import torch.nn as nn 4 | 5 | from .efficient_block_base import EfficientBlockBase 6 | 7 | 8 | class NoOpConvertBlock(EfficientBlockBase): 9 | """ 10 | This class provides an interface with EfficientBlockBase for modules that do not 11 | need convert. 12 | Args: 13 | model (nn.Module): NoOpConvertBlock takes model as input and generate a wrapper 14 | instance of EfficientBlockBase with same functionality as model, with no change 15 | applied when convert() is called. 16 | """ 17 | 18 | def __init__(self, model: nn.Module): 19 | super().__init__() 20 | self.model = model 21 | 22 | def convert(self, *args, **kwargs): 23 | pass 24 | 25 | def forward(self, x): 26 | return self.model(x) 27 | -------------------------------------------------------------------------------- /pytorchvideo/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .ava import Ava # noqa 4 | from .charades import Charades # noqa 5 | from .clip_sampling import ( # noqa; noqa 6 | ClipSampler, 7 | make_clip_sampler, 8 | RandomClipSampler, 9 | UniformClipSampler, 10 | ) 11 | from .domsev import DomsevFrameDataset, DomsevVideoDataset # noqa 12 | from .epic_kitchen_forecasting import EpicKitchenForecasting # noqa 13 | from .epic_kitchen_recognition import EpicKitchenRecognition # noqa 14 | from .hmdb51 import Hmdb51 # noqa 15 | from .kinetics import Kinetics # noqa 16 | from .labeled_video_dataset import labeled_video_dataset, LabeledVideoDataset # noqa 17 | from .ssv2 import SSv2 18 | from .ucf101 import Ucf101 # noqa 19 | -------------------------------------------------------------------------------- /pytorchvideo/data/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from enum import Enum 3 | 4 | 5 | class DecoderType(Enum): 6 | PYAV = "pyav" 7 | TORCHVISION = "torchvision" 8 | DECORD = "decord" 9 | -------------------------------------------------------------------------------- /pytorchvideo/data/ego4d/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .ego4d_dataset import Ego4dMomentsDataset 4 | -------------------------------------------------------------------------------- /pytorchvideo/data/encoded_video.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import io 4 | import logging 5 | import pathlib 6 | from typing import Any, Dict 7 | 8 | from iopath.common.file_io import g_pathmgr 9 | from pytorchvideo.data.decoder import DecoderType 10 | 11 | from .video import Video 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def select_video_class(decoder: str) -> Video: 18 | """ 19 | Select the class for accessing clips based on provided decoder string 20 | 21 | Args: 22 | decoder (str): Defines what type of decoder used to decode a video. 
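        Only the module for the requested decoder is imported, so optional backends
        (e.g. decord) are needed only when actually selected.

        Example (illustrative)::

            video_cls = select_video_class("pyav")         # -> EncodedVideoPyAV
            video_cls = select_video_class("torchvision")  # -> EncodedVideoTorchVision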
23 | """ 24 | if DecoderType(decoder) == DecoderType.PYAV: 25 | from .encoded_video_pyav import EncodedVideoPyAV 26 | 27 | video_cls = EncodedVideoPyAV 28 | elif DecoderType(decoder) == DecoderType.TORCHVISION: 29 | from .encoded_video_torchvision import EncodedVideoTorchVision 30 | 31 | video_cls = EncodedVideoTorchVision 32 | elif DecoderType(decoder) == DecoderType.DECORD: 33 | from .encoded_video_decord import EncodedVideoDecord 34 | 35 | video_cls = EncodedVideoDecord 36 | else: 37 | raise NotImplementedError(f"Unknown decoder type {decoder}") 38 | 39 | return video_cls 40 | 41 | 42 | class EncodedVideo(Video): 43 | """ 44 | EncodedVideo is an abstraction for accessing clips from an encoded video. 45 | It supports selective decoding when header information is available. 46 | """ 47 | 48 | @classmethod 49 | def from_path( 50 | cls, 51 | file_path: str, 52 | decode_video: bool = True, 53 | decode_audio: bool = True, 54 | decoder: str = "pyav", 55 | **other_args: Dict[str, Any], 56 | ): 57 | """ 58 | Fetches the given video path using PathManager (allowing remote uris to be 59 | fetched) and constructs the EncodedVideo object. 60 | 61 | Args: 62 | file_path (str): a PathManager file-path. 63 | """ 64 | # We read the file with PathManager so that we can read from remote uris. 65 | with g_pathmgr.open(file_path, "rb") as fh: 66 | video_file = io.BytesIO(fh.read()) 67 | 68 | video_cls = select_video_class(decoder) 69 | return video_cls( 70 | file=video_file, 71 | video_name=pathlib.Path(file_path).name, 72 | decode_video=decode_video, 73 | decode_audio=decode_audio, 74 | **other_args, 75 | ) 76 | -------------------------------------------------------------------------------- /pytorchvideo/data/epic_kitchen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .epic_kitchen_dataset import ActionData, EpicKitchenDataset 4 | -------------------------------------------------------------------------------- /pytorchvideo/data/kinetics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from typing import Any, Callable, Dict, Optional, Type 4 | 5 | import torch 6 | from pytorchvideo.data.clip_sampling import ClipSampler 7 | 8 | from .labeled_video_dataset import labeled_video_dataset, LabeledVideoDataset 9 | 10 | 11 | """ 12 | Action recognition video dataset for Kinetics-{400,600,700} 13 | 14 | """ 15 | 16 | 17 | def Kinetics( 18 | data_path: str, 19 | clip_sampler: ClipSampler, 20 | video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler, 21 | transform: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, 22 | video_path_prefix: str = "", 23 | decode_audio: bool = True, 24 | decoder: str = "pyav", 25 | ) -> LabeledVideoDataset: 26 | """ 27 | A helper function to create ``LabeledVideoDataset`` object for the Kinetics dataset. 28 | 29 | Args: 30 | data_path (str): Path to the data. The path type defines how the data 31 | should be read: 32 | 33 | * For a file path, the file is read and each line is parsed into a 34 | video path and label. 35 | * For a directory, the directory structure defines the classes 36 | (i.e. each subdirectory is a class). 37 | 38 | clip_sampler (ClipSampler): Defines how clips should be sampled from each 39 | video. See the clip sampling documentation for more information. 
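            For example (illustrative), ``pytorchvideo.data.make_clip_sampler("random", 2.0)``
            builds a sampler that draws one random 2-second clip per video.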
40 | 41 | video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal 42 | video container. This defines the order videos are decoded and, 43 | if necessary, the distributed split. 44 | 45 | transform (Callable): This callable is evaluated on the clip output before 46 | the clip is returned. It can be used for user defined preprocessing and 47 | augmentations to the clips. See the ``LabeledVideoDataset`` class for clip 48 | output format. 49 | 50 | video_path_prefix (str): Path to root directory with the videos that are 51 | loaded in ``LabeledVideoDataset``. All the video paths before loading 52 | are prefixed with this path. 53 | 54 | decode_audio (bool): If True, also decode audio from video. 55 | 56 | decoder (str): Defines what type of decoder used to decode a video. 57 | 58 | """ 59 | 60 | torch._C._log_api_usage_once("PYTORCHVIDEO.dataset.Kinetics") 61 | 62 | return labeled_video_dataset( 63 | data_path, 64 | clip_sampler, 65 | video_sampler, 66 | transform, 67 | video_path_prefix, 68 | decode_audio, 69 | decoder, 70 | ) 71 | -------------------------------------------------------------------------------- /pytorchvideo/data/ucf101.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from typing import Any, Callable, Dict, Optional, Type 4 | 5 | import torch 6 | from pytorchvideo.data.clip_sampling import ClipSampler 7 | 8 | from .labeled_video_dataset import labeled_video_dataset, LabeledVideoDataset 9 | 10 | 11 | """ 12 | Action recognition video dataset for UCF101 13 | 14 | """ 15 | 16 | 17 | def Ucf101( 18 | data_path: str, 19 | clip_sampler: ClipSampler, 20 | video_sampler: Type[torch.utils.data.Sampler] = torch.utils.data.RandomSampler, 21 | transform: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, 22 | video_path_prefix: str = "", 23 | decode_audio: bool = True, 24 | decoder: str = "pyav", 25 | ) -> LabeledVideoDataset: 26 | """ 27 | A helper function to create ``LabeledVideoDataset`` object for the Ucf101 dataset. 28 | 29 | Args: 30 | data_path (str): Path to the data. The path type defines how the data 31 | should be read: 32 | 33 | * For a file path, the file is read and each line is parsed into a 34 | video path and label. 35 | * For a directory, the directory structure defines the classes 36 | (i.e. each subdirectory is a class). 37 | 38 | clip_sampler (ClipSampler): Defines how clips should be sampled from each 39 | video. See the clip sampling documentation for more information. 40 | 41 | video_sampler (Type[torch.utils.data.Sampler]): Sampler for the internal 42 | video container. This defines the order videos are decoded and, 43 | if necessary, the distributed split. 44 | 45 | transform (Callable): This callable is evaluated on the clip output before 46 | the clip is returned. It can be used for user defined preprocessing and 47 | augmentations to the clips. See the ``LabeledVideoDataset`` class for clip 48 | output format. 49 | 50 | video_path_prefix (str): Path to root directory with the videos that are 51 | loaded in ``LabeledVideoDataset``. All the video paths before loading 52 | are prefixed with this path. 53 | 54 | decode_audio (bool): If True, also decode audio from video. 55 | 56 | decoder (str): Defines what type of decoder used to decode a video. 
57 | 58 | """ 59 | 60 | torch._C._log_api_usage_once("PYTORCHVIDEO.dataset.Ucf101") 61 | 62 | return labeled_video_dataset( 63 | data_path, 64 | clip_sampler, 65 | video_sampler, 66 | transform, 67 | video_path_prefix, 68 | decode_audio, 69 | decoder, 70 | ) 71 | -------------------------------------------------------------------------------- /pytorchvideo/data/video.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import BinaryIO, Dict, Optional 5 | 6 | import torch 7 | from iopath.common.file_io import g_pathmgr 8 | 9 | 10 | class VideoPathHandler: 11 | """ 12 | Utility class that handles all deciphering and caching of video paths for 13 | encoded and frame videos. 14 | """ 15 | 16 | def __init__(self) -> None: 17 | # Pathmanager isn't guaranteed to be in correct order, 18 | # sorting is expensive, so we cache paths in case of frame video and reuse. 19 | self.path_order_cache = {} 20 | 21 | def video_from_path( 22 | self, filepath, decode_video=True, decode_audio=False, decoder="pyav", fps=30 23 | ): 24 | try: 25 | is_file = g_pathmgr.isfile(filepath) 26 | is_dir = g_pathmgr.isdir(filepath) 27 | except NotImplementedError: 28 | # Not all PathManager handlers support is{file,dir} functions, when this is the 29 | # case, we default to assuming the path is a file. 30 | is_file = True 31 | is_dir = False 32 | 33 | if is_file: 34 | from pytorchvideo.data.encoded_video import EncodedVideo 35 | 36 | return EncodedVideo.from_path( 37 | filepath, 38 | decode_video=decode_video, 39 | decode_audio=decode_audio, 40 | decoder=decoder, 41 | ) 42 | elif is_dir: 43 | from pytorchvideo.data.frame_video import FrameVideo 44 | 45 | assert not decode_audio, "decode_audio must be False when using FrameVideo" 46 | return FrameVideo.from_directory( 47 | filepath, fps, path_order_cache=self.path_order_cache 48 | ) 49 | else: 50 | raise FileNotFoundError(f"{filepath} not found.") 51 | 52 | 53 | class Video(ABC): 54 | """ 55 | Video provides an interface to access clips from a video container. 56 | """ 57 | 58 | @abstractmethod 59 | def __init__( 60 | self, 61 | file: BinaryIO, 62 | video_name: Optional[str] = None, 63 | decode_audio: bool = True, 64 | ) -> None: 65 | """ 66 | Args: 67 | file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that 68 | contains the encoded video. 69 | """ 70 | pass 71 | 72 | @property 73 | @abstractmethod 74 | def duration(self) -> float: 75 | """ 76 | Returns: 77 | duration of the video in seconds 78 | """ 79 | pass 80 | 81 | @abstractmethod 82 | def get_clip( 83 | self, start_sec: float, end_sec: float 84 | ) -> Dict[str, Optional[torch.Tensor]]: 85 | """ 86 | Retrieves frames from the internal video at the specified start and end times 87 | in seconds (the video always starts at 0 seconds). 88 | 89 | Args: 90 | start_sec (float): the clip start time in seconds 91 | end_sec (float): the clip end time in seconds 92 | Returns: 93 | video_data_dictonary: A dictionary mapping strings to tensor of the clip's 94 | underlying data. 95 | 96 | """ 97 | pass 98 | 99 | def close(self): 100 | pass 101 | -------------------------------------------------------------------------------- /pytorchvideo/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | 3 | from .attention import Mlp, MultiScaleAttention, MultiScaleBlock 4 | from .attention_torchscript import ScriptableMultiScaleBlock 5 | from .drop_path import DropPath 6 | from .fusion import ConcatFusion, make_fusion_layer, ReduceFusion 7 | from .mlp import make_multilayer_perceptron 8 | from .positional_encoding import PositionalEncoding, SpatioTemporalClsPositionalEncoding 9 | from .positional_encoding_torchscript import ( 10 | ScriptableSpatioTemporalClsPositionalEncoding, 11 | ) 12 | -------------------------------------------------------------------------------- /pytorchvideo/layers/accelerator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/layers/accelerator/mobile_cpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/layers/accelerator/mobile_cpu/activation_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """ 4 | This file contains supported activation functions in efficient block and helper code. 5 | All supported activation functions are child class of EfficientBlockBase, and included 6 | in supported_act_functions. 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | from pytorchvideo.accelerator.efficient_blocks.efficient_block_base import ( 12 | EfficientBlockBase, 13 | ) 14 | from pytorchvideo.layers.swish import Swish as SwishCustomOp 15 | 16 | 17 | class _NaiveSwish(nn.Module): 18 | """ 19 | Helper class to implement naive swish for deploy. It is not intended to be used to 20 | build network. 21 | """ 22 | 23 | def __init__(self): 24 | super().__init__() 25 | self.mul_func = nn.quantized.FloatFunctional() 26 | 27 | def forward(self, x): 28 | return self.mul_func.mul(x, torch.sigmoid(x)) 29 | 30 | 31 | class Swish(EfficientBlockBase): 32 | """ 33 | Swish activation function for efficient block. When in original form for training, 34 | using custom op version of swish for better training memory efficiency. When in 35 | deployable form, use naive swish as custom op is not supported to run on Pytorch 36 | Mobile. For better latency on mobile CPU, use HardSwish instead. 37 | """ 38 | 39 | def __init__(self): 40 | super().__init__() 41 | self.act = SwishCustomOp() 42 | 43 | def forward(self, x): 44 | return self.act(x) 45 | 46 | def convert(self, *args, **kwarg): 47 | self.act = _NaiveSwish() 48 | 49 | 50 | class HardSwish(EfficientBlockBase): 51 | """ 52 | Hardswish activation function. It is natively supported by Pytorch Mobile, and has 53 | better latency than Swish in int8 mode. 54 | """ 55 | 56 | def __init__(self): 57 | super().__init__() 58 | self.act = nn.Hardswish() 59 | 60 | def forward(self, x): 61 | return self.act(x) 62 | 63 | def convert(self, *args, **kwarg): 64 | pass 65 | 66 | 67 | class ReLU(EfficientBlockBase): 68 | """ 69 | ReLU activation function for EfficientBlockBase. 
70 | """ 71 | 72 | def __init__(self): 73 | super().__init__() 74 | self.act = nn.ReLU(inplace=True) 75 | 76 | def forward(self, x): 77 | return self.act(x) 78 | 79 | def convert(self, *args, **kwarg): 80 | pass 81 | 82 | 83 | class Identity(EfficientBlockBase): 84 | """ 85 | Identity operation for EfficientBlockBase. 86 | """ 87 | 88 | def __init__(self): 89 | super().__init__() 90 | self.act = nn.Identity() 91 | 92 | def forward(self, x): 93 | return self.act(x) 94 | 95 | def convert(self, *args, **kwarg): 96 | pass 97 | 98 | 99 | supported_act_functions = { 100 | "relu": ReLU, 101 | "swish": Swish, 102 | "hswish": HardSwish, 103 | "identity": Identity, 104 | } 105 | -------------------------------------------------------------------------------- /pytorchvideo/layers/accelerator/mobile_cpu/fully_connected.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import torch.nn as nn 4 | from pytorchvideo.accelerator.efficient_blocks.no_op_convert_block import ( 5 | NoOpConvertBlock, 6 | ) 7 | 8 | 9 | class FullyConnected(NoOpConvertBlock): 10 | """ 11 | Implements fully connected layer. This operator is natively supported by QNNPACK for 12 | mobile CPU with good efficiency, and no change is made upon convert(). 13 | Args: 14 | in_features (int): input channels for FC layer. 15 | out_features (int): output channels for FC layer. 16 | bias (bool): if True, bias is applied 17 | """ 18 | 19 | def __init__( 20 | self, 21 | in_features: int, 22 | out_features: int, 23 | bias: bool = True, 24 | ): 25 | super().__init__(model=nn.Linear(in_features, out_features, bias=bias)) 26 | -------------------------------------------------------------------------------- /pytorchvideo/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def drop_path( 8 | x: torch.Tensor, drop_prob: float = 0.0, training: bool = False 9 | ) -> torch.Tensor: 10 | """ 11 | Stochastic Depth per sample. 12 | 13 | Args: 14 | x (tensor): Input tensor. 15 | drop_prob (float): Probability to apply drop path. 16 | training (bool): If True, apply drop path to input. Otherwise (tesing), return input. 17 | """ 18 | if drop_prob == 0.0 or not training: 19 | return x 20 | keep_prob = 1 - drop_prob 21 | shape = (x.shape[0],) + (1,) * ( 22 | x.ndim - 1 23 | ) # work with diff dim tensors, not just 2D ConvNets 24 | mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 25 | mask.floor_() # binarize 26 | output = x.div(keep_prob) * mask 27 | return output 28 | 29 | 30 | class DropPath(nn.Module): 31 | """ 32 | Drop paths (Stochastic Depth) per sample. 33 | """ 34 | 35 | def __init__(self, drop_prob: float = 0.0) -> None: 36 | """ 37 | Args: 38 | drop_prob (float): Probability to apply drop path. 39 | """ 40 | super(DropPath, self).__init__() 41 | self.drop_prob = drop_prob 42 | 43 | def forward(self, x: torch.Tensor) -> torch.Tensor: 44 | """ 45 | Args: 46 | x (tensor): Input tensor. 47 | """ 48 | return drop_path(x, self.drop_prob, self.training) 49 | -------------------------------------------------------------------------------- /pytorchvideo/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | 3 | from typing import Callable, List, Optional, Tuple 4 | 5 | from torch import nn 6 | 7 | 8 | def make_multilayer_perceptron( 9 | fully_connected_dims: List[int], 10 | norm: Optional[Callable] = None, 11 | mid_activation: Callable = nn.ReLU, 12 | final_activation: Optional[Callable] = nn.ReLU, 13 | dropout_rate: float = 0.0, 14 | ) -> Tuple[nn.Module, int]: 15 | """ 16 | Factory function for Multi-Layer Perceptron. These are constructed as repeated 17 | blocks of the following format where each fc represents the blocks output/input dimension. 18 | 19 | :: 20 | 21 | Linear (in=fc[i-1], out=fc[i]) 22 | ↓ 23 | Normalization (norm) 24 | ↓ 25 | Activation (mid_activation) 26 | ↓ 27 | After the repeated Perceptron blocks, 28 | a final dropout and activation layer is applied: 29 | ↓ 30 | Dropout (p=dropout_rate) 31 | ↓ 32 | Activation (final_activation) 33 | 34 | """ 35 | assert isinstance(fully_connected_dims, list) 36 | assert len(fully_connected_dims) > 1 37 | assert all(_is_pos_int(x) for x in fully_connected_dims) 38 | 39 | layers = [] 40 | cur_dim = fully_connected_dims[0] 41 | for dim in fully_connected_dims[1:-1]: 42 | layers.append(nn.Linear(cur_dim, dim)) 43 | if norm is not None: 44 | layers.append(norm(dim)) 45 | layers.append(mid_activation()) 46 | cur_dim = dim 47 | layers.append(nn.Linear(cur_dim, fully_connected_dims[-1])) 48 | if dropout_rate > 0: 49 | layers.append(nn.Dropout(p=dropout_rate)) 50 | if final_activation is not None: 51 | layers.append(final_activation()) 52 | 53 | mlp = nn.Sequential(*layers) 54 | output_dim = fully_connected_dims[-1] 55 | return mlp, output_dim 56 | 57 | 58 | def _is_pos_int(number: int) -> bool: 59 | """ 60 | Returns True if a number is a positive integer. 61 | """ 62 | return type(number) == int and number >= 0 63 | -------------------------------------------------------------------------------- /pytorchvideo/layers/positional_encoding_torchscript.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | from typing import Tuple 4 | 5 | import torch 6 | from torch import nn 7 | 8 | 9 | class ScriptableSpatioTemporalClsPositionalEncoding(nn.Module): 10 | """ 11 | Add a cls token and apply a spatiotemporal encoding to a tensor. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | embed_dim: int, 17 | patch_embed_shape: Tuple[int, int, int], 18 | sep_pos_embed: bool = False, 19 | has_cls: bool = True, 20 | ) -> None: 21 | """ 22 | Args: 23 | embed_dim (int): Embedding dimension for input sequence. 24 | patch_embed_shape (Tuple): The number of patches in each dimension 25 | (T, H, W) after patch embedding. 26 | sep_pos_embed (bool): If set to true, one positional encoding is used for 27 | spatial patches and another positional encoding is used for temporal 28 | sequence. Otherwise, only one positional encoding is used for all the 29 | patches. 30 | has_cls (bool): If set to true, a cls token is added in the beginning of each 31 | input sequence. 32 | """ 33 | super().__init__() 34 | assert ( 35 | len(patch_embed_shape) == 3 36 | ), "Patch_embed_shape should be in the form of (T, H, W)." 
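        # This TorchScript-friendly variant currently supports only separate
        # spatial/temporal embeddings and no cls token; the assert below and the
        # one in forward() enforce that configuration.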
37 | assert not has_cls 38 | self.sep_pos_embed = sep_pos_embed 39 | self._patch_embed_shape = patch_embed_shape 40 | self.num_spatial_patch = patch_embed_shape[1] * patch_embed_shape[2] 41 | self.num_temporal_patch = patch_embed_shape[0] 42 | 43 | self.pos_embed_spatial = nn.Parameter( 44 | torch.zeros(1, self.num_spatial_patch, embed_dim) 45 | ) 46 | self.pos_embed_temporal = nn.Parameter( 47 | torch.zeros(1, self.num_temporal_patch, embed_dim) 48 | ) 49 | 50 | @property 51 | def patch_embed_shape(self): 52 | return self._patch_embed_shape 53 | 54 | def forward(self, x: torch.Tensor) -> torch.Tensor: 55 | """ 56 | Args: 57 | x (torch.Tensor): Input tensor. 58 | """ 59 | B, N, C = x.shape 60 | 61 | assert self.sep_pos_embed 62 | pos_embed = self.pos_embed_spatial.repeat( 63 | 1, self.num_temporal_patch, 1 64 | ) + torch.repeat_interleave( 65 | self.pos_embed_temporal, 66 | self.num_spatial_patch, 67 | dim=1, 68 | ) 69 | x = x + pos_embed 70 | return x 71 | -------------------------------------------------------------------------------- /pytorchvideo/layers/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Swish(nn.Module): 8 | """ 9 | Wrapper for the Swish activation function. 10 | """ 11 | 12 | def forward(self, x): 13 | return SwishFunction.apply(x) 14 | 15 | 16 | class SwishFunction(torch.autograd.Function): 17 | """ 18 | Implementation of the Swish activation function: x * sigmoid(x). 19 | 20 | Searching for activation functions. Ramachandran, Prajit and Zoph, Barret 21 | and Le, Quoc V. 2017 22 | """ 23 | 24 | @staticmethod 25 | def forward(ctx, x): 26 | result = x * torch.sigmoid(x) 27 | ctx.save_for_backward(x) 28 | return result 29 | 30 | @staticmethod 31 | def backward(ctx, grad_output): 32 | (x,) = ctx.saved_tensors 33 | sigmoid_x = torch.sigmoid(x) 34 | return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x))) 35 | -------------------------------------------------------------------------------- /pytorchvideo/layers/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import math 4 | from typing import List 5 | 6 | 7 | def set_attributes(self, params: List[object] = None) -> None: 8 | """ 9 | An utility function used in classes to set attributes from the input list of parameters. 10 | Args: 11 | params (list): list of parameters. 12 | """ 13 | if params: 14 | for k, v in params.items(): 15 | if k != "self": 16 | setattr(self, k, v) 17 | 18 | 19 | def round_width(width, multiplier, min_width=8, divisor=8, ceil=False): 20 | """ 21 | Round width of filters based on width multiplier 22 | Args: 23 | width (int): the channel dimensions of the input. 24 | multiplier (float): the multiplication factor. 25 | min_width (int): the minimum width after multiplication. 26 | divisor (int): the new width should be dividable by divisor. 27 | ceil (bool): If True, use ceiling as the rounding method. 
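        Example (illustrative)::

            round_width(48, 1.5)   # -> 72 (48 * 1.5 is already a multiple of 8)
            round_width(64, 0.75)  # -> 48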
28 | """ 29 | if not multiplier: 30 | return width 31 | 32 | width *= multiplier 33 | min_width = min_width or divisor 34 | if ceil: 35 | width_out = max(min_width, int(math.ceil(width / divisor)) * divisor) 36 | else: 37 | width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) 38 | if width_out < 0.9 * width: 39 | width_out += divisor 40 | return int(width_out) 41 | 42 | 43 | def round_repeats(repeats, multiplier): 44 | """ 45 | Round number of layers based on depth multiplier. 46 | """ 47 | if not multiplier: 48 | return repeats 49 | return int(math.ceil(multiplier * repeats)) 50 | -------------------------------------------------------------------------------- /pytorchvideo/losses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/pytorchvideo/6cdc929315aab1b5674b6dcf73b16ec99147735f/pytorchvideo/losses/__init__.py -------------------------------------------------------------------------------- /pytorchvideo/losses/soft_target_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from pytorchvideo.layers.utils import set_attributes 7 | from pytorchvideo.transforms.functional import convert_to_one_hot 8 | 9 | 10 | class SoftTargetCrossEntropyLoss(nn.Module): 11 | """ 12 | Adapted from Classy Vision: ./classy_vision/losses/soft_target_cross_entropy_loss.py. 13 | This allows the targets for the cross entropy loss to be multi-label. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | ignore_index: int = -100, 19 | reduction: str = "mean", 20 | normalize_targets: bool = True, 21 | ) -> None: 22 | """ 23 | Args: 24 | ignore_index (int): sample should be ignored for loss if the class is this value. 25 | reduction (str): specifies reduction to apply to the output. 26 | normalize_targets (bool): whether the targets should be normalized to a sum of 1 27 | based on the total count of positive targets for a given sample. 28 | """ 29 | super().__init__() 30 | set_attributes(self, locals()) 31 | assert isinstance(self.normalize_targets, bool) 32 | if self.reduction not in ["mean", "none"]: 33 | raise NotImplementedError( 34 | 'reduction type "{}" not implemented'.format(self.reduction) 35 | ) 36 | self.eps = torch.finfo(torch.float32).eps 37 | 38 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 39 | """ 40 | Args: 41 | input (torch.Tensor): the shape of the tensor is N x C, where N is the number of 42 | samples and C is the number of classes. The tensor is raw input without 43 | softmax/sigmoid. 44 | target (torch.Tensor): the shape of the tensor is N x C or N. If the shape is N, we 45 | will convert the target to one hot vectors. 46 | """ 47 | # Check if targets are inputted as class integers 48 | if target.ndim == 1: 49 | assert ( 50 | input.shape[0] == target.shape[0] 51 | ), "SoftTargetCrossEntropyLoss requires input and target to have same batch size!" 
52 | target = convert_to_one_hot(target.view(-1, 1), input.shape[1]) 53 | 54 | assert input.shape == target.shape, ( 55 | "SoftTargetCrossEntropyLoss requires input and target to be same " 56 | f"shape: {input.shape} != {target.shape}" 57 | ) 58 | 59 | # Samples where the targets are ignore_index do not contribute to the loss 60 | N, C = target.shape 61 | valid_mask = torch.ones((N, 1), dtype=torch.float).to(input.device) 62 | if 0 <= self.ignore_index <= C - 1: 63 | drop_idx = target[:, self.ignore_index] > 0 64 | valid_mask[drop_idx] = 0 65 | 66 | valid_targets = target.float() * valid_mask 67 | if self.normalize_targets: 68 | valid_targets /= self.eps + valid_targets.sum(dim=1, keepdim=True) 69 | per_sample_per_target_loss = -valid_targets * F.log_softmax(input, -1) 70 | 71 | per_sample_loss = torch.sum(per_sample_per_target_loss, -1) 72 | # Perform reduction 73 | if self.reduction == "mean": 74 | # Normalize based on the number of samples with > 0 non-ignored targets 75 | loss = per_sample_loss.sum() / torch.sum( 76 | (torch.sum(valid_mask, -1) > 0) 77 | ).clamp(min=1) 78 | elif self.reduction == "none": 79 | loss = per_sample_loss 80 | 81 | return loss 82 | -------------------------------------------------------------------------------- /pytorchvideo/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .csn import create_csn 4 | from .head import create_res_basic_head, ResNetBasicHead 5 | from .masked_multistream import ( 6 | LearnMaskedDefault, 7 | LSTM, 8 | MaskedMultiPathWay, 9 | MaskedSequential, 10 | MaskedTemporalPooling, 11 | TransposeMultiheadAttention, 12 | TransposeTransformerEncoder, 13 | ) 14 | from .net import MultiPathWayWithFuse, Net 15 | from .resnet import BottleneckBlock, create_bottleneck_block, create_resnet 16 | from .slowfast import create_slowfast 17 | from .stem import create_conv_patch_embed, create_res_basic_stem, ResNetBasicStem 18 | from .vision_transformers import create_multiscale_vision_transformers 19 | from .weight_init import init_net_weights 20 | -------------------------------------------------------------------------------- /pytorchvideo/models/accelerator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/models/accelerator/mobile_cpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/README.md: -------------------------------------------------------------------------------- 1 | ## TorchHub Models 2 | 3 | PyTorchVideo provides a large set of state-of-the-art models with pre-trained weights through [TorchHub](https://pytorch.org/hub/). Check the tables below for the torchhub names and corresponding models. 4 | 5 | 6 | ### Kinetics-400 7 | 8 | Models are trained on Kinetics-400.
For more benchmarking and model details, please check the [PyTorchVideo Model Zoo](https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md) 9 | 10 | torchhub name | arch | depth | frame length x sample rate | top 1 | top 5 | 11 | ------------------------ | -------- | ----- | -------------------------- | ----- | ----- | 12 | c2d_r50 | C2D | R50 | 8x8 | 71.46 | 89.68 | 13 | i3d_r50 | I3D | R50 | 8x8 | 73.27 | 90.70 | 14 | slow_r50 | Slow | R50 | 8x8 | 74.58 | 91.63 | 15 | slowfast_r50 | SlowFast | R50 | 8x8 | 76.94 | 92.69 | 16 | slowfast_r101 | SlowFast | R101 | 8x8 | 77.90 | 93.27 | 17 | slowfast_16x8_r101_50_50 | SlowFast | R101 | 16x8 | 78.70 | 93.61 | 18 | csn_r101 | CSN | R101 | 32x2 | 77.00 | 92.90 | 19 | r2plus1d_r50 | R(2+1)D | R50 | 16x4 | 76.01 | 92.23 | 20 | x3d_xs | X3D | XS | 4x12 | 69.12 | 88.63 | 21 | x3d_s | X3D | S | 13x6 | 73.33 | 91.27 | 22 | x3d_m | X3D | M | 16x5 | 75.94 | 92.72 | 23 | x3d_l | X3D | L | 16x5 | 77.44 | 93.31 | 24 | 25 | ### PytorchVideo Accelerator Models 26 | 27 | **Efficient Models for mobile CPU** 28 | Models are trained on Kinetics-400. Latency is benchmarked on Samsung S8 phone with 1s input clip length. 29 | 30 | torchhub name | model | top 1 | top 5 | latency (ms) | 31 | ---------------- |--------|-------|-------|--------------| 32 | efficient_x3d_xs | X3D_XS | 68.5 | 88.0 | 233 | 33 | efficient_x3d_s | X3D_S | 73.0 | 90.6 | 764 | 34 | 35 | 36 | 37 | ### Using PyTorchVideo torchhub models 38 | The models have been integrated into TorchHub, so could be loaded with TorchHub with or without pre-trained models. You can specify the torchhub name for the model to construct the model with pre-trained weights: 39 | 40 | ```Python 41 | # Pick a pretrained model 42 | model_name = "slowfast_r50" 43 | model = torch.hub.load("facebookresearch/pytorchvideo:main", model=model_name, pretrained=True) 44 | ``` 45 | 46 | Notes: 47 | * Please check [torchhub inference tutorial](https://pytorchvideo.org/docs/tutorial_torchhub_inference) for more details about how to load models from TorchHub and perform inference. 48 | * Check [Model Zoo](https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md) for the full set of supported PytorchVideo model zoo and more details about how the model zoo is prepared. 49 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .csn import csn_r101 4 | from .efficient_x3d_mobile_cpu import efficient_x3d_s, efficient_x3d_xs 5 | from .r2plus1d import r2plus1d_r50 6 | from .resnet import c2d_r50, i3d_r50, slow_r50, slow_r50_detection 7 | from .slowfast import ( 8 | slowfast_16x8_r101_50_50, 9 | slowfast_r101, 10 | slowfast_r50, 11 | slowfast_r50_detection, 12 | ) 13 | from .vision_transformers import mvit_base_16, mvit_base_16x4, mvit_base_32x3 14 | from .x3d import x3d_l, x3d_m, x3d_s, x3d_xs 15 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/csn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from typing import Any 4 | 5 | import torch.nn as nn 6 | from pytorchvideo.models.csn import create_csn 7 | from torch.hub import load_state_dict_from_url 8 | 9 | 10 | """ 11 | Channel-Separated Convolutional Network models for video recognition. 12 | """ 13 | 14 | root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics" 15 | checkpoint_paths = { 16 | "csn_r101": f"{root_dir}/CSN_32x2_R101.pyth", 17 | } 18 | 19 | 20 | def csn_r101( 21 | pretrained: bool = False, progress: bool = True, **kwargs: Any 22 | ) -> nn.Module: 23 | r""" 24 | Channel-Separated Convolutional Networks (CSN) R101 model architecture [1] 25 | with pretrained weights based on 32x2 setting on the Kinetics dataset. 26 | Model with pretrained weights has top1 accuracy of 77.0 (trained on 16x8 GPUs). 27 | 28 | [1] "Video classification with channel-separated convolutional networks" 29 | Du Tran, Heng Wang, Lorenzo Torresani, Matt Feiszli. ICCV 2019. 30 | https://arxiv.org/abs/1904.02811 31 | 32 | Args: 33 | pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset 34 | progress (bool): If True, displays a progress bar of the download to stderr 35 | kwargs: use these to modify any of the other model settings. All the 36 | options are defined in pytorchvideo/models/resnet.py 37 | 38 | NOTE: to use the pretrained model, do not modify the model configuration 39 | via the kwargs. Only modify settings via kwargs to initialize a new model 40 | without pretrained weights. 41 | """ 42 | model = create_csn( 43 | model_depth=101, 44 | stem_pool=nn.MaxPool3d, 45 | head_pool_kernel_size=(4, 7, 7), 46 | **kwargs, 47 | ) 48 | 49 | if pretrained: 50 | path = checkpoint_paths["csn_r101"] 51 | # All models are loaded onto CPU by default 52 | checkpoint = load_state_dict_from_url( 53 | path, progress=progress, map_location="cpu" 54 | ) 55 | state_dict = checkpoint["model_state"] 56 | model.load_state_dict(state_dict) 57 | 58 | return model 59 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/efficient_x3d_mobile_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from typing import Any, Optional 4 | 5 | import torch.nn as nn 6 | from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import create_x3d 7 | from torch.hub import load_state_dict_from_url 8 | 9 | 10 | _root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics" 11 | _checkpoint_paths = { 12 | "efficient_x3d_xs": f"{_root_dir}/efficient_x3d_xs_original_form.pyth", 13 | "efficient_x3d_s": f"{_root_dir}/efficient_x3d_s_original_form.pyth", 14 | } 15 | 16 | 17 | def _efficient_x3d( 18 | pretrained: bool = False, 19 | progress: bool = True, 20 | checkpoint_path: Optional[str] = None, 21 | # Model params 22 | expansion: str = "XS", 23 | **kwargs: Any, 24 | ) -> nn.Module: 25 | model = create_x3d( 26 | expansion=expansion, 27 | **kwargs, 28 | ) 29 | 30 | if pretrained and checkpoint_path is not None: 31 | # All models are loaded onto CPU by default 32 | state_dict = load_state_dict_from_url( 33 | checkpoint_path, progress=progress, map_location="cpu" 34 | ) 35 | model.load_state_dict(state_dict) 36 | 37 | return model 38 | 39 | 40 | def efficient_x3d_xs(pretrained: bool = False, progress: bool = True, **kwargs): 41 | r""" 42 | X3D-XS model architectures [1] with pretrained weights trained 43 | on the Kinetics dataset with efficient implementation for mobile cpu. 44 | 45 | [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for 46 | Efficient Video Recognition." https://arxiv.org/abs/2004.04730 47 | 48 | Args: 49 | pretrained (bool): If True, returns a model pre-trained on Kinetcis-400 dataset 50 | progress (bool): If True, displays a progress bar of the download to stderr 51 | To modify any other model settings, specify them in the kwargs. 52 | All the args are defined in pytorchvideo/models/x3d.py 53 | """ 54 | return _efficient_x3d( 55 | pretrained=pretrained, 56 | progress=progress, 57 | checkpoint_path=_checkpoint_paths["efficient_x3d_xs"], 58 | expansion="XS", 59 | **kwargs, 60 | ) 61 | 62 | 63 | def efficient_x3d_s(pretrained: bool = False, progress: bool = True, **kwargs): 64 | r""" 65 | X3D-S model architectures [1] with pretrained weights trained 66 | on the Kinetics dataset with efficient implementation for mobile cpu. 67 | 68 | [1] Christoph Feichtenhofer, "X3D: Expanding Architectures for 69 | Efficient Video Recognition." https://arxiv.org/abs/2004.04730 70 | 71 | Args: 72 | pretrained (bool): If True, returns a model pre-trained on Kinetcis-400 dataset 73 | progress (bool): If True, displays a progress bar of the download to stderr 74 | To modify any other model settings, specify them in the kwargs. 75 | All the args are defined in pytorchvideo/models/x3d.py 76 | """ 77 | return _efficient_x3d( 78 | pretrained=pretrained, 79 | progress=progress, 80 | checkpoint_path=_checkpoint_paths["efficient_x3d_s"], 81 | expansion="S", 82 | **kwargs, 83 | ) 84 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/r2plus1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from typing import Any 4 | 5 | import torch.nn as nn 6 | from pytorchvideo.models.r2plus1d import create_r2plus1d 7 | from torch.hub import load_state_dict_from_url 8 | 9 | 10 | """ 11 | R(2+1)D style models for video recognition. 
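Example (illustrative)::

    from pytorchvideo.models.hub import r2plus1d_r50

    model = r2plus1d_r50(pretrained=True)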
12 | """ 13 | 14 | root_dir = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics" 15 | checkpoint_paths = { 16 | "r2plus1d_r50": f"{root_dir}/R2PLUS1D_16x4_R50.pyth", 17 | } 18 | 19 | 20 | def r2plus1d_r50( 21 | pretrained: bool = False, progress: bool = True, **kwargs: Any 22 | ) -> nn.Module: 23 | r""" 24 | 25 | R(2+1)D model architecture from [1] with pretrained weights based on 16x4 setting 26 | on the Kinetics dataset. Model with pretrained weights has top1 accuracy of 76.01. 27 | (trained on 8*8 GPUs) 28 | 29 | [1] "A closer look at spatiotemporal convolutions for action recognition" 30 | Du Tran, Heng Wang, Lorenzo Torresani, Jamie Ray, Yann LeCun, Manohar Paluri. CVPR 2018. 31 | https://arxiv.org/abs/1711.11248 32 | 33 | Args: 34 | pretrained (bool): If True, returns a model pre-trained on the Kinetics dataset 35 | progress (bool): If True, displays a progress bar of the download to stderr 36 | kwargs: use these to modify any of the other model settings. All the 37 | options are defined in pytorchvideo/models/resnet.py 38 | 39 | NOTE: to use the pretrained model, do not modify the model configuration 40 | via the kwargs. Only modify settings via kwargs to initialize a new model 41 | without pretrained weights. 42 | """ 43 | model = create_r2plus1d(dropout_rate=0.5, **kwargs) 44 | 45 | if pretrained: 46 | path = checkpoint_paths["r2plus1d_r50"] 47 | # All models are loaded onto CPU by default 48 | checkpoint = load_state_dict_from_url( 49 | path, progress=progress, map_location="cpu" 50 | ) 51 | state_dict = checkpoint["model_state"] 52 | model.load_state_dict(state_dict) 53 | 54 | return model 55 | -------------------------------------------------------------------------------- /pytorchvideo/models/hub/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from typing import Any, Callable, Dict, Optional 4 | 5 | import torch.nn as nn 6 | from torch.hub import load_state_dict_from_url 7 | 8 | 9 | MODEL_ZOO_ROOT_DIR = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo" 10 | 11 | 12 | def hub_model_builder( 13 | model_builder_func: Callable, 14 | pretrained: bool = False, 15 | progress: bool = True, 16 | checkpoint_path: str = "", 17 | default_config: Optional[Dict[Any, Any]] = None, 18 | **kwargs: Any, 19 | ) -> nn.Module: 20 | """ 21 | model_builder_func (Callable): Model builder function. 22 | pretrained (bool): Whether to load a pretrained model or not. Default: False. 23 | progress (bool): Whether or not to display a progress bar to stderr. Default: True. 24 | checkpoint_path (str): URL of the model weight to download. 25 | default_config (Dict): Default model configs that is passed to the model builder. 26 | **kwargs: (Any): Additional model configs. Do not modify the model configuration 27 | via the kwargs for pretrained model. 28 | """ 29 | if pretrained: 30 | assert len(kwargs) == 0, "Do not change kwargs for pretrained model." 
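    # Any config key the caller did not supply is filled in from default_config
    # before the model builder is invoked.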
31 | 32 | if default_config is not None: 33 | for argument, value in default_config.items(): 34 | if kwargs.get(argument) is None: 35 | kwargs[argument] = value 36 | 37 | model = model_builder_func(**kwargs) 38 | if pretrained: 39 | # All models are loaded onto CPU by default 40 | checkpoint = load_state_dict_from_url( 41 | checkpoint_path, progress=progress, map_location="cpu" 42 | ) 43 | state_dict = checkpoint["model_state"] 44 | model.load_state_dict(state_dict) 45 | return model 46 | -------------------------------------------------------------------------------- /pytorchvideo/models/simclr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from typing import Optional 4 | 5 | import torch 6 | import torch.distributed as dist 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from fvcore.nn.distributed import differentiable_all_gather 10 | from pytorchvideo.layers.utils import set_attributes 11 | 12 | 13 | class SimCLR(nn.Module): 14 | """ 15 | A Simple Framework for Contrastive Learning of Visual Representations 16 | Details can be found from: 17 | https://arxiv.org/abs/2002.05709 18 | """ 19 | 20 | def __init__( 21 | self, 22 | mlp: nn.Module, 23 | backbone: Optional[nn.Module] = None, 24 | temperature: float = 0.07, 25 | ) -> None: 26 | super().__init__() 27 | 28 | torch._C._log_api_usage_once("PYTORCHVIDEO.model.SimCLR.__init__") 29 | 30 | set_attributes(self, locals()) 31 | 32 | def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: 33 | """ 34 | Args: 35 | x1 (torch.tensor): a batch of images with one set of augmentations applied. 36 | The input tensor shape should be compatible with the backbone. 37 | x2 (torch.tensor): the same batch of images with a different set of augmentations 38 | applied. The input tensor shape should be compatible with the backbone. 39 | """ 40 | if self.backbone is not None: 41 | x1 = self.backbone(x1) 42 | x1 = self.mlp(x1) 43 | x1 = F.normalize(x1, p=2, dim=1) 44 | 45 | if self.backbone is not None: 46 | x2 = self.backbone(x2) 47 | x2 = self.mlp(x2) 48 | x2 = F.normalize(x2, p=2, dim=1) 49 | x2 = torch.cat(differentiable_all_gather(x2), dim=0) 50 | 51 | prod = torch.einsum("nc,kc->nk", [x1, x2]) 52 | prod = prod.div(self.temperature) 53 | batch_size = x1.size(0) 54 | if dist.is_available() and dist.is_initialized(): 55 | device_ind = dist.get_rank() 56 | else: 57 | device_ind = 0 58 | gt = ( 59 | torch.tensor( 60 | list(range(device_ind * batch_size, (device_ind + 1) * batch_size)) 61 | ) 62 | .long() 63 | .to(x1.device) 64 | ) 65 | loss = torch.nn.functional.cross_entropy(prod, gt) 66 | return loss 67 | -------------------------------------------------------------------------------- /pytorchvideo/neural_engine/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import itertools 4 | from collections import OrderedDict 5 | from typing import List, Union 6 | 7 | import networkx as nx 8 | from pytorchvideo.neural_engine import HookBase 9 | 10 | 11 | class NeuralEngine: 12 | """ 13 | NeuralEngine takes a list of hooks and executes them in their topological order. The 14 | topological order of the hooks is determined by their required inputs and outputs.
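    Example (illustrative, assuming ``hooks`` is a list of HookBase objects whose
    declared inputs and outputs form a DAG)::

        engine = NeuralEngine(hooks)
        status = engine("path/to/video.mp4")  # equivalent to engine.run({"path": ...})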
15 | """ 16 | 17 | def __init__(self, hooks: List[HookBase]) -> None: 18 | self.hooks = hooks 19 | self.execution_order_func = NeuralEngine.topological_sort 20 | 21 | def get_execution_order(self, status): 22 | return self.execution_order_func(status, self.hooks) 23 | 24 | def set_execution_order_func(self, func): 25 | self.execution_order_func = func 26 | 27 | @staticmethod 28 | def topological_sort(status, hooks): 29 | # Get DAG 30 | graph = nx.DiGraph() 31 | edges = [] 32 | pending_outputs = [] 33 | output_to_hook = {} 34 | for hook in hooks: 35 | for pair in itertools.product(hook.get_inputs(), hook.get_outputs()): 36 | edges.append(pair) 37 | for output in hook.get_outputs(): 38 | assert output not in pending_outputs 39 | output_to_hook[output] = hook 40 | pending_outputs.append(output) 41 | graph.add_edges_from(edges) 42 | for _current in nx.topological_sort(graph): 43 | if _current in pending_outputs: 44 | _hook = output_to_hook[_current] 45 | yield _hook 46 | for _hook_out in _hook.get_outputs(): 47 | pending_outputs.remove(_hook_out) 48 | else: 49 | assert _current in status 50 | assert len(pending_outputs) == 0 51 | 52 | def run(self, status: OrderedDict): 53 | for hook in self.get_execution_order(status): 54 | status.update(hook.run(status)) 55 | return status 56 | 57 | def __enter__( 58 | self, 59 | ): 60 | return self 61 | 62 | def __exit__( 63 | self, 64 | type, 65 | value, 66 | traceback, 67 | ): 68 | pass 69 | 70 | def __call__( 71 | self, 72 | status: Union[OrderedDict, str], 73 | ): 74 | # If not specified, the default input should be the path to video. 75 | if type(status) == str: 76 | status = {"path": status} 77 | return self.run(status) 78 | -------------------------------------------------------------------------------- /pytorchvideo/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .augmix import AugMix # noqa 4 | from .mix import CutMix, MixUp, MixVideo # noqa 5 | from .rand_augment import RandAugment # noqa 6 | from .transforms import * # noqa 7 | from .transforms_factory import create_video_transform # noqa 8 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/README.md: -------------------------------------------------------------------------------- 1 | ## PyTorchVideo Trainer 2 | 3 | A [PyTorch-Lightning](https://www.pytorchlightning.ai/) based trainer supporting PyTorchVideo models and dataloaders for various video understanding tasks. 4 | 5 | Currently supported tasks include: 6 | 7 | - Video Action Recognition: ResNets, SlowFast models, X3D models and MViT 8 | - Video Self-Supervised Learning: SimCLR, BYOL, MoCo 9 | - (Planned) Video Action Detection 10 | 11 | ## Installation 12 | 13 | These instructions assume that both pytorch and torchvision are already installed 14 | using the instructions in [INSTALL.md](https://github.com/facebookresearch/pytorchvideo/blob/main/INSTALL.md#requirements). 15 | 16 | Install the required additional dependency `recipes` by running the following command, 17 | ``` 18 | pip install "git+https://github.com/facebookresearch/recipes.git" 19 | ``` 20 | 21 | Then install PyTorchVideo Trainer by running, 22 | ``` 23 | git clone https://github.com/facebookresearch/pytorchvideo.git 24 | cd pytorchvideo/pytorchvideo_trainer 25 | pip install -e . 26 | 27 | # For developing and testing 28 | pip install -e ".[test,dev]"
29 | ``` 30 | 31 | ## Testing 32 | 33 | Before running the tests, please ensure that you have installed the necessary additional test dependencies. 34 | 35 | Use the following command to run the tests: 36 | ``` 37 | # From the current directory 38 | python -m unittest discover -v -s ./tests 39 | ``` 40 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | def register_components() -> None: 5 | """ 6 | Calls register_components() for all subfolders so we can register 7 | subcomponents to Hydra's ConfigStore. 8 | """ 9 | import pytorchvideo_trainer.datamodule.datamodule # noqa 10 | import pytorchvideo_trainer.module.byol # noqa 11 | import pytorchvideo_trainer.module.lr_policy # noqa 12 | import pytorchvideo_trainer.module.moco_v2 # noqa 13 | import pytorchvideo_trainer.module.optimizer # noqa 14 | import pytorchvideo_trainer.module.simclr # noqa 15 | import pytorchvideo_trainer.module.video_classification # noqa 16 | import pytorchvideo_trainer.train_app # noqa 17 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .precise_batchnorm import PreciseBn # noqa 4 | 5 | 6 | __all__ = [ 7 | "PreciseBn", 8 | ] 9 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/callbacks/precise_batchnorm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from typing import Generator 4 | 5 | import torch 6 | from fvcore.nn.precise_bn import update_bn_stats 7 | from pytorch_lightning.callbacks import Callback 8 | from pytorch_lightning.core.lightning import LightningModule 9 | from pytorch_lightning.trainer.trainer import Trainer 10 | from torch.utils.data import DataLoader 11 | 12 | 13 | class PreciseBn(Callback): 14 | """ 15 | Recompute and update the batch norm stats to make them more precise. During 16 | training, both the BN stats and the model weights change after every iteration, so 17 | the running average cannot precisely reflect the actual stats of the 18 | current model. 19 | In this callback, the BN stats are recomputed with fixed weights, to make 20 | the running average more precise during the training phase. Specifically, it 21 | computes the true average of per-batch mean/variance instead of the 22 | running average. See Sec. 3 of the paper "Rethinking Batch in BatchNorm" 23 | for details. 24 | """ 25 | 26 | def __init__(self, num_batches: int) -> None: 27 | """ 28 | Args: 29 | num_batches (int): Number of steps / mini-batches to 30 | perform to sample for updating the precise batchnorm 31 | stats.
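                Typically set through the Hydra configs rather than constructed
                directly; for example (illustrative), the classification configs in
                ``pytorchvideo_trainer/conf`` select ``callbacks: precise_bn`` and
                set ``num_batches: 200``.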
32 | """ 33 | self.num_batches = num_batches 34 | 35 | def _get_precise_bn_loader( 36 | self, data_loader: DataLoader, pl_module: LightningModule 37 | ) -> Generator[torch.Tensor, None, None]: 38 | for batch in data_loader: 39 | inputs = batch[pl_module.modality_key] 40 | if isinstance(inputs, list): 41 | inputs = [x.to(pl_module.device) for x in inputs] 42 | else: 43 | inputs = inputs.to(pl_module.device) 44 | yield inputs 45 | 46 | def on_train_epoch_end( 47 | self, 48 | trainer: Trainer, 49 | pl_module: LightningModule, 50 | ) -> None: 51 | """ 52 | Called at the end of every epoch only during the training 53 | phase. 54 | 55 | Args: 56 | trainer (Trainer): A PyTorch-Lightning trainer object. 57 | pl_module (LightningModule): A PyTorch-Lightning module. 58 | Typically supported modules include - 59 | pytorchvideo_trainer.module.VideoClassificationModule, etc. 60 | """ 61 | # pyre-ignore[16] 62 | dataloader = trainer.datamodule.train_dataloader() 63 | precise_bn_loader = self._get_precise_bn_loader( 64 | data_loader=dataloader, pl_module=pl_module 65 | ) 66 | update_bn_stats( 67 | model=pl_module.model, # pyre-ignore[6] 68 | data_loader=precise_bn_loader, 69 | num_iters=self.num_batches, 70 | ) 71 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import torchrecipes.core.conf # noqa 4 | 5 | # Components to register with this config 6 | from pytorchvideo_trainer import register_components 7 | 8 | 9 | register_components() 10 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/byol_train_app_conf.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: byol_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_contrastive 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_contrastive 10 | - module/knn_memory: kinetics_k400 11 | - module/model: slow_r50_byol 12 | - module/loss: similarity 13 | - module/optim: sgd_ssl 14 | - module/metrics: accuracy 15 | - schema/trainer: trainer 16 | - trainer: cpu 17 | - callbacks: null 18 | - _self_ 19 | trainer: 20 | sync_batchnorm: false # set this to true for training 21 | 22 | module: 23 | momentum_anneal_cosine: true 24 | 25 | hydra: 26 | searchpath: 27 | - pkg://pytorchvideo_trainer.conf 28 | - pkg://torchrecipes.core.conf 29 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/callbacks/precise_bn.yaml: -------------------------------------------------------------------------------- 1 | precise_bn: 2 | _target_: pytorchvideo_trainer.callbacks.precise_batchnorm.PreciseBn 3 | num_batches: null 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/classification_mvit_16x4.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: video_classification_module_conf_vision_transformer 5 | 
- schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_classification 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_classification_mvit_16x4 10 | - module/model: mvit_base_16x4 11 | - module/loss: soft_cross_entropy 12 | - module/optim: adamw 13 | - module/metrics: accuracy 14 | - module/lr_scheduler: cosine_with_warmup 15 | - schema/trainer: trainer 16 | - trainer: multi_gpu 17 | - _self_ 18 | 19 | module: 20 | clip_gradient_norm: 1.0 21 | ensemble_method: "sum" 22 | lr_scheduler: 23 | max_iters: 200 24 | warmup_start_lr: 1.6e-05 25 | warmup_iters: 30 26 | cosine_after_warmup: true 27 | cosine_end_lr: 1.6e-05 28 | optim: 29 | lr: 0.0016 30 | weight_decay: 0.05 31 | method: adamw 32 | zero_weight_decay_1d_param: true 33 | batch_transform: 34 | _target_: pytorchvideo_trainer.datamodule.transforms.MixVideoBatchWrapper 35 | mixup_alpha: 0.8 36 | cutmix_prob: 0.5 37 | cutmix_alpha: 1.0 38 | label_smoothing: 0.1 39 | 40 | datamodule: 41 | dataloader: 42 | train: 43 | batch_size: 2 44 | dataset: 45 | clip_sampler: 46 | clip_duration: 2.13 47 | collate_fn: 48 | _target_: pytorchvideo_trainer.datamodule.collators.build_collator_from_name 49 | name: multiple_samples_collate 50 | val: 51 | batch_size: 8 52 | dataset: 53 | clip_sampler: 54 | clip_duration: 2.13 55 | test: 56 | batch_size: 8 57 | dataset: 58 | clip_sampler: 59 | clip_duration: 2.13 60 | 61 | trainer: 62 | num_nodes: 16 63 | gpus: 8 64 | max_epochs: 200 65 | sync_batchnorm: False 66 | replace_sampler_ddp: False 67 | 68 | hydra: 69 | searchpath: 70 | - pkg://pytorchvideo_trainer.conf 71 | - pkg://torchrecipes.core.conf 72 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/classification_slow_8x8_r50.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: video_classification_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_classification 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_classification_slow 10 | - module/model: slow_r50 11 | - module/loss: cross_entropy 12 | - module/optim: sgd 13 | - module/metrics: accuracy 14 | - module/lr_scheduler: cosine_with_warmup 15 | - schema/trainer: trainer 16 | - trainer: multi_gpu 17 | - callbacks: precise_bn 18 | - _self_ 19 | 20 | module: 21 | ensemble_method: "sum" 22 | lr_scheduler: 23 | max_iters: 196 24 | warmup_start_lr: 0.01 25 | warmup_iters: 34 26 | optim: 27 | lr: 0.8 28 | nesterov: true 29 | 30 | callbacks: 31 | precise_bn: 32 | num_batches: 200 33 | 34 | trainer: 35 | num_nodes: 8 36 | gpus: 8 37 | max_epochs: 196 38 | sync_batchnorm: False 39 | replace_sampler_ddp: False 40 | 41 | 42 | hydra: 43 | searchpath: 44 | - pkg://pytorchvideo_trainer.conf 45 | - pkg://torchrecipes.core.conf 46 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/classification_slowfast_8x8_r50.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: video_classification_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: 
ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_classification 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_classification_slowfast 10 | - module/model: slowfast_r50 11 | - module/loss: cross_entropy 12 | - module/optim: sgd 13 | - module/metrics: accuracy 14 | - module/lr_scheduler: cosine_with_warmup 15 | - schema/trainer: trainer 16 | - trainer: multi_gpu 17 | - callbacks: precise_bn 18 | - _self_ 19 | 20 | module: 21 | ensemble_method: "sum" 22 | lr_scheduler: 23 | max_iters: 196 24 | warmup_start_lr: 0.01 25 | warmup_iters: 34 26 | optim: 27 | lr: 0.8 28 | nesterov: true 29 | 30 | callbacks: 31 | precise_bn: 32 | num_batches: 200 33 | 34 | trainer: 35 | num_nodes: 8 36 | gpus: 8 37 | max_epochs: 196 38 | sync_batchnorm: False 39 | replace_sampler_ddp: False 40 | 41 | 42 | hydra: 43 | searchpath: 44 | - pkg://pytorchvideo_trainer.conf 45 | - pkg://torchrecipes.core.conf 46 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/classification_x3d_xs.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: video_classification_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_classification 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_classification_x3d_xs 10 | - module/model: x3d_xs 11 | - module/loss: cross_entropy 12 | - module/optim: sgd 13 | - module/metrics: accuracy 14 | - module/lr_scheduler: cosine_with_warmup 15 | - schema/trainer: trainer 16 | - trainer: multi_gpu 17 | - callbacks: precise_bn 18 | - _self_ 19 | 20 | module: 21 | ensemble_method: "sum" 22 | lr_scheduler: 23 | max_iters: 300 24 | warmup_start_lr: 0.01 25 | warmup_iters: 35 26 | optim: 27 | lr: 0.8 28 | nesterov: true 29 | weight_decay: 5e-5 30 | 31 | datamodule: 32 | dataloader: 33 | train: 34 | batch_size: 16 35 | dataset: 36 | clip_sampler: 37 | clip_duration: 1.6 38 | val: 39 | batch_size: 16 40 | dataset: 41 | clip_sampler: 42 | clip_duration: 1.6 43 | test: 44 | batch_size: 16 45 | dataset: 46 | clip_sampler: 47 | clip_duration: 1.6 48 | 49 | callbacks: 50 | precise_bn: 51 | num_batches: 200 52 | 53 | trainer: 54 | num_nodes: 8 55 | gpus: 8 56 | max_epochs: 300 57 | sync_batchnorm: False 58 | replace_sampler_ddp: False 59 | 60 | 61 | hydra: 62 | searchpath: 63 | - pkg://pytorchvideo_trainer.conf 64 | - pkg://torchrecipes.core.conf 65 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/dataloader/kinetics_classification.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | dataset: 3 | _target_: pytorchvideo.data.Kinetics 4 | data_path: ??? 5 | video_path_prefix: ??? 6 | clip_sampler: 7 | _target_: pytorchvideo.data.clip_sampling.RandomClipSampler 8 | clip_duration: 2.13 9 | 10 | shuffle: True 11 | batch_size: 8 12 | num_workers: 8 13 | pin_memory: True 14 | 15 | val: 16 | dataset: 17 | _target_: pytorchvideo.data.Kinetics 18 | data_path: ??? 19 | video_path_prefix: ??? 
20 | clip_sampler: 21 | _target_: pytorchvideo.data.clip_sampling.UniformClipSampler 22 | clip_duration: 2.13 23 | 24 | shuffle: False 25 | batch_size: 8 26 | num_workers: 8 27 | pin_memory: True 28 | 29 | test: 30 | dataset: 31 | _target_: pytorchvideo.data.Kinetics 32 | data_path: ??? 33 | video_path_prefix: ??? 34 | clip_sampler: 35 | _target_: pytorchvideo.data.clip_sampling.ConstantClipsPerVideoSampler 36 | clip_duration: 2.13 37 | clips_per_video: 10 #num_ensemble_views 38 | augs_per_clip: 3 # num_spatial_crops 39 | 40 | shuffle: False 41 | batch_size: 8 42 | num_workers: 8 43 | pin_memory: True 44 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/dataloader/kinetics_contrastive.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | dataset: 3 | _target_: pytorchvideo.data.Kinetics 4 | data_path: ??? 5 | video_path_prefix: ??? 6 | clip_sampler: 7 | _target_: pytorchvideo.data.clip_sampling.RandomMultiClipSampler 8 | clip_duration: 2.0 9 | num_clips: 2 10 | 11 | shuffle: True 12 | batch_size: 8 13 | num_workers: 8 14 | 15 | val: 16 | dataset: 17 | _target_: pytorchvideo.data.Kinetics 18 | data_path: ??? 19 | video_path_prefix: ??? 20 | clip_sampler: 21 | _target_: pytorchvideo.data.clip_sampling.UniformClipSampler 22 | clip_duration: 2.0 23 | 24 | shuffle: False 25 | batch_size: 8 26 | num_workers: 8 27 | 28 | test: 29 | dataset: 30 | _target_: pytorchvideo.data.Kinetics 31 | data_path: ??? 32 | video_path_prefix: ??? 33 | clip_sampler: 34 | _target_: pytorchvideo.data.clip_sampling.ConstantClipsPerVideoSampler 35 | clip_duration: 2.0 36 | clips_per_video: 10 #num_ensemble_views 37 | augs_per_clip: 3 # num_spatial_crops 38 | 39 | shuffle: False 40 | batch_size: 8 41 | num_workers: 8 42 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_classification_mvit_16x4.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo_trainer.datamodule.transforms.RepeatandConverttoList 3 | repeat_num: 2 4 | - _target_: pytorchvideo_trainer.datamodule.transforms.ApplyTransformToKeyOnList 5 | transform: 6 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 7 | num_samples: 16 8 | - _target_: pytorchvideo.transforms.Div255 9 | - _target_: pytorchvideo.transforms.Permute 10 | dims: [1,0,2,3] 11 | - _target_: pytorchvideo.transforms.rand_augment.RandAugment 12 | magnitude: 7 13 | num_layers: 4 14 | - _target_: pytorchvideo.transforms.Permute 15 | dims: [1,0,2,3] 16 | - _target_: pytorchvideo.transforms.Normalize 17 | mean: [0.45, 0.45, 0.45] 18 | std: [0.225, 0.225, 0.225] 19 | - _target_: pytorchvideo.transforms.RandomResizedCrop 20 | target_height: 224 21 | target_width: 224 22 | scale: [0.08, 1.0] 23 | aspect_ratio: [0.75, 1.3333] 24 | - _target_: torchvision.transforms.RandomHorizontalFlip 25 | p: 0.5 26 | - _target_: pytorchvideo.transforms.Permute 27 | dims: [1,0,2,3] 28 | - _target_: pytorchvideo_trainer.datamodule.rand_erase_transform.RandomErasing 29 | probability: 0.25 30 | mode: "pixel" 31 | max_count: 1 32 | num_splits: 1 33 | device: "cpu" 34 | - _target_: pytorchvideo.transforms.Permute 35 | dims: [1,0,2,3] 36 | key: video 37 | - _target_: pytorchvideo.transforms.RemoveKey 38 | key: audio 39 | val: 40 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 41 | 
transform: 42 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 43 | num_samples: 16 44 | - _target_: pytorchvideo.transforms.Div255 45 | - _target_: pytorchvideo.transforms.Normalize 46 | mean: [0.45, 0.45, 0.45] 47 | std: [0.225, 0.225, 0.225] 48 | - _target_: pytorchvideo.transforms.ShortSideScale 49 | size: 224 50 | - _target_: torchvision.transforms.CenterCrop 51 | size: 224 52 | key: video 53 | - _target_: pytorchvideo.transforms.RemoveKey 54 | key: audio 55 | test: 56 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 57 | transform: 58 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 59 | num_samples: 16 60 | - _target_: pytorchvideo.transforms.Div255 61 | - _target_: pytorchvideo.transforms.Normalize 62 | mean: [0.45, 0.45, 0.45] 63 | std: [0.225, 0.225, 0.225] 64 | - _target_: pytorchvideo.transforms.ShortSideScale 65 | size: 224 66 | key: video 67 | - _target_: pytorchvideo.transforms.UniformCropVideo 68 | size: 224 69 | - _target_: pytorchvideo.transforms.RemoveKey 70 | key: audio 71 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_classification_slow.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 3 | transform: 4 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 5 | num_samples: 8 6 | - _target_: pytorchvideo.transforms.Div255 7 | - _target_: pytorchvideo.transforms.Normalize 8 | mean: [0.45, 0.45, 0.45] 9 | std: [0.225, 0.225, 0.225] 10 | - _target_: pytorchvideo.transforms.RandomShortSideScale 11 | min_size: 256 12 | max_size: 320 13 | - _target_: torchvision.transforms.RandomCrop 14 | size: 224 15 | - _target_: torchvision.transforms.RandomHorizontalFlip 16 | p: 0.5 17 | key: video 18 | - _target_: pytorchvideo.transforms.RemoveKey 19 | key: audio 20 | val: 21 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 22 | transform: 23 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 24 | num_samples: 8 25 | - _target_: pytorchvideo.transforms.Div255 26 | - _target_: pytorchvideo.transforms.Normalize 27 | mean: [0.45, 0.45, 0.45] 28 | std: [0.225, 0.225, 0.225] 29 | - _target_: pytorchvideo.transforms.ShortSideScale 30 | size: 256 31 | - _target_: torchvision.transforms.CenterCrop 32 | size: 256 33 | key: video 34 | - _target_: pytorchvideo.transforms.RemoveKey 35 | key: audio 36 | test: 37 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 38 | transform: 39 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 40 | num_samples: 8 41 | - _target_: pytorchvideo.transforms.Div255 42 | - _target_: pytorchvideo.transforms.Normalize 43 | mean: [0.45, 0.45, 0.45] 44 | std: [0.225, 0.225, 0.225] 45 | - _target_: pytorchvideo.transforms.ShortSideScale 46 | size: 256 47 | key: video 48 | - _target_: pytorchvideo.transforms.UniformCropVideo 49 | size: 256 50 | - _target_: pytorchvideo.transforms.RemoveKey 51 | key: audio 52 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_classification_slowfast.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 3 | transform: 4 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 5 | num_samples: 32 6 | - _target_: 
pytorchvideo.transforms.Div255 7 | - _target_: pytorchvideo.transforms.Normalize 8 | mean: [0.45, 0.45, 0.45] 9 | std: [0.225, 0.225, 0.225] 10 | - _target_: pytorchvideo.transforms.RandomShortSideScale 11 | min_size: 256 12 | max_size: 320 13 | - _target_: torchvision.transforms.RandomCrop 14 | size: 224 15 | - _target_: torchvision.transforms.RandomHorizontalFlip 16 | p: 0.5 17 | - _target_: pytorchvideo_trainer.datamodule.transforms.SlowFastPackPathway 18 | alpha: 4 19 | key: video 20 | - _target_: pytorchvideo.transforms.RemoveKey 21 | key: audio 22 | val: 23 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 24 | transform: 25 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 26 | num_samples: 32 27 | - _target_: pytorchvideo.transforms.Div255 28 | - _target_: pytorchvideo.transforms.Normalize 29 | mean: [0.45, 0.45, 0.45] 30 | std: [0.225, 0.225, 0.225] 31 | - _target_: pytorchvideo.transforms.ShortSideScale 32 | size: 256 33 | - _target_: torchvision.transforms.CenterCrop 34 | size: 256 35 | - _target_: pytorchvideo_trainer.datamodule.transforms.SlowFastPackPathway 36 | alpha: 4 37 | key: video 38 | - _target_: pytorchvideo.transforms.RemoveKey 39 | key: audio 40 | test: 41 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 42 | transform: 43 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 44 | num_samples: 32 45 | - _target_: pytorchvideo.transforms.Div255 46 | - _target_: pytorchvideo.transforms.Normalize 47 | mean: [0.45, 0.45, 0.45] 48 | std: [0.225, 0.225, 0.225] 49 | - _target_: pytorchvideo.transforms.ShortSideScale 50 | size: 256 51 | key: video 52 | - _target_: pytorchvideo.transforms.UniformCropVideo 53 | size: 256 54 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 55 | transform: 56 | - _target_: pytorchvideo_trainer.datamodule.transforms.SlowFastPackPathway 57 | alpha: 4 58 | key: video 59 | - _target_: pytorchvideo.transforms.RemoveKey 60 | key: audio 61 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_classification_x3d_xs.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 3 | transform: 4 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 5 | num_samples: 4 6 | - _target_: pytorchvideo.transforms.Div255 7 | - _target_: pytorchvideo.transforms.Normalize 8 | mean: [0.45, 0.45, 0.45] 9 | std: [0.225, 0.225, 0.225] 10 | - _target_: pytorchvideo.transforms.RandomShortSideScale 11 | min_size: 182 12 | max_size: 228 13 | - _target_: torchvision.transforms.RandomCrop 14 | size: 160 15 | - _target_: torchvision.transforms.RandomHorizontalFlip 16 | p: 0.5 17 | key: video 18 | - _target_: pytorchvideo.transforms.RemoveKey 19 | key: audio 20 | val: 21 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 22 | transform: 23 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 24 | num_samples: 4 25 | - _target_: pytorchvideo.transforms.Div255 26 | - _target_: pytorchvideo.transforms.Normalize 27 | mean: [0.45, 0.45, 0.45] 28 | std: [0.225, 0.225, 0.225] 29 | - _target_: pytorchvideo.transforms.ShortSideScale 30 | size: 182 31 | - _target_: torchvision.transforms.CenterCrop 32 | size: 182 33 | key: video 34 | - _target_: pytorchvideo.transforms.RemoveKey 35 | key: audio 36 | test: 37 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 38 | transform: 39 | - _target_: 
pytorchvideo.transforms.UniformTemporalSubsample 40 | num_samples: 4 41 | - _target_: pytorchvideo.transforms.Div255 42 | - _target_: pytorchvideo.transforms.Normalize 43 | mean: [0.45, 0.45, 0.45] 44 | std: [0.225, 0.225, 0.225] 45 | - _target_: pytorchvideo.transforms.ShortSideScale 46 | size: 182 47 | key: video 48 | - _target_: pytorchvideo.transforms.UniformCropVideo 49 | size: 182 50 | - _target_: pytorchvideo.transforms.RemoveKey 51 | key: audio 52 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_contrastive.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo_trainer.datamodule.transforms.ApplyTransformToKeyOnList 3 | transform: 4 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 5 | num_samples: 8 6 | - _target_: pytorchvideo.transforms.Div255 7 | - _target_: pytorchvideo_trainer.datamodule.transforms.ColorJitterVideoSSl 8 | bri_con_sat: [0.6, 0.6, 0.6] 9 | hue: 0.15 10 | p_color_jitter: 0.8 11 | p_convert_gray: 0.2 12 | - _target_: pytorchvideo.transforms.Normalize 13 | mean: [0.45, 0.45, 0.45] 14 | std: [0.225, 0.225, 0.225] 15 | - _target_: pytorchvideo.transforms.RandomResizedCrop 16 | target_height: 224 17 | target_width: 224 18 | scale: [0.2, 0.766] 19 | aspect_ratio: [0.75, 1.3333] 20 | - _target_: torchvision.transforms.RandomHorizontalFlip 21 | p: 0.5 22 | key: video 23 | - _target_: pytorchvideo.transforms.RemoveKey 24 | key: audio 25 | val: 26 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 27 | transform: 28 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 29 | num_samples: 8 30 | - _target_: pytorchvideo.transforms.Div255 31 | - _target_: pytorchvideo.transforms.Normalize 32 | mean: [0.45, 0.45, 0.45] 33 | std: [0.225, 0.225, 0.225] 34 | - _target_: pytorchvideo.transforms.ShortSideScale 35 | size: 256 36 | - _target_: torchvision.transforms.CenterCrop 37 | size: 256 38 | key: video 39 | - _target_: pytorchvideo.transforms.RemoveKey 40 | key: audio 41 | test: 42 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 43 | transform: 44 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 45 | num_samples: 8 46 | - _target_: pytorchvideo.transforms.Div255 47 | - _target_: pytorchvideo.transforms.Normalize 48 | mean: [0.45, 0.45, 0.45] 49 | std: [0.225, 0.225, 0.225] 50 | - _target_: pytorchvideo.transforms.ShortSideScale 51 | size: 256 52 | key: video 53 | - _target_: pytorchvideo.transforms.UniformCropVideo 54 | size: 256 55 | - _target_: pytorchvideo.transforms.RemoveKey 56 | key: audio 57 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/datamodule/transforms/kinetics_moco_v2.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | - _target_: pytorchvideo_trainer.datamodule.transforms.ApplyTransformToKeyOnList 3 | transform: 4 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 5 | num_samples: 8 6 | - _target_: pytorchvideo.transforms.Div255 7 | - _target_: pytorchvideo_trainer.datamodule.transforms.ColorJitterVideoSSl 8 | bri_con_sat: [0.4, 0.4, 0.4] 9 | hue: 0.4 10 | p_color_jitter: 0.8 11 | p_convert_gray: 0.2 12 | - _target_: pytorchvideo.transforms.Normalize 13 | mean: [0.45, 0.45, 0.45] 14 | std: [0.225, 0.225, 0.225] 15 | - _target_: pytorchvideo.transforms.RandomResizedCrop 16 | 
target_height: 224 17 | target_width: 224 18 | scale: [0.2, 0.766] 19 | aspect_ratio: [0.75, 1.3333] 20 | - _target_: torchvision.transforms.RandomHorizontalFlip 21 | p: 0.5 22 | key: video 23 | - _target_: pytorchvideo.transforms.RemoveKey 24 | key: audio 25 | val: 26 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 27 | transform: 28 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 29 | num_samples: 8 30 | - _target_: pytorchvideo.transforms.Div255 31 | - _target_: pytorchvideo.transforms.Normalize 32 | mean: [0.45, 0.45, 0.45] 33 | std: [0.225, 0.225, 0.225] 34 | - _target_: pytorchvideo.transforms.ShortSideScale 35 | size: 256 36 | - _target_: torchvision.transforms.CenterCrop 37 | size: 256 38 | key: video 39 | - _target_: pytorchvideo.transforms.RemoveKey 40 | key: audio 41 | test: 42 | - _target_: pytorchvideo.transforms.ApplyTransformToKey 43 | transform: 44 | - _target_: pytorchvideo.transforms.UniformTemporalSubsample 45 | num_samples: 8 46 | - _target_: pytorchvideo.transforms.Div255 47 | - _target_: pytorchvideo.transforms.Normalize 48 | mean: [0.45, 0.45, 0.45] 49 | std: [0.225, 0.225, 0.225] 50 | - _target_: pytorchvideo.transforms.ShortSideScale 51 | size: 256 52 | key: video 53 | - _target_: pytorchvideo.transforms.UniformCropVideo 54 | size: 256 55 | - _target_: pytorchvideo.transforms.RemoveKey 56 | key: audio 57 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/logger/ptl.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.loggers.TensorBoardLogger 2 | save_dir: ??? 3 | name: default 4 | version: null 5 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/moco_v2_train_app_conf.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: moco_v2_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_contrastive 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_moco_v2 10 | - module/knn_memory: kinetics_k400 11 | - module/model: slow_r50_moco_v2 12 | - module/loss: contrastive 13 | - module/optim: sgd_ssl 14 | - module/metrics: accuracy 15 | - schema/trainer: trainer 16 | - trainer: cpu 17 | - callbacks: null 18 | - _self_ 19 | trainer: 20 | sync_batchnorm: false # set this to true for training 21 | 22 | module: 23 | dim: ${module.model.backbone_embed_dim} 24 | k: 65536 25 | batch_shuffle: true 26 | local_shuffle_bn: true 27 | 28 | hydra: 29 | searchpath: 30 | - pkg://pytorchvideo_trainer.conf 31 | - pkg://torchrecipes.core.conf 32 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/knn_memory/kinetics_k400.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.ssl_helper.KnnMemory 2 | temperature: ${module.loss.temperature} 3 | dim: ${module.model.backbone_embed_dim} 4 | length: 239975 5 | downstream_classes: 400 6 | knn_k: 200 7 | momentum: 1.0 8 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/loss/contrastive.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.losses.ContrastiveLoss 2 | temperature: 0.1 3 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/loss/cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torch.nn.CrossEntropyLoss 3 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/loss/nt_xent.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorchvideo_trainer.module.losses.NtxentLoss 3 | temperature: 0.1 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/loss/similarity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorchvideo_trainer.module.losses.SimilarityLoss 3 | temperature: 0.1 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/loss/soft_cross_entropy.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorchvideo_trainer.module.losses.SoftTargetCrossEntropy 3 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/lr_scheduler/cosine_with_warmup.yaml: -------------------------------------------------------------------------------- 1 | lr_policy: 'cosine' 2 | cosine_after_warmup: False 3 | cosine_end_lr: 0 4 | warmup_iters: 34 5 | warmup_start_lr: 0.01 6 | max_iters: ${trainer.max_epochs} 7 | lr: ${module.optim.lr} 8 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/metrics/accuracy.yaml: -------------------------------------------------------------------------------- 1 | - name: accuracy_top1 2 | config: 3 | _target_: torchmetrics.Accuracy 4 | top_k: 1 5 | - name: accuracy_top5 6 | config: 7 | _target_: torchmetrics.Accuracy 8 | top_k: 5 9 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/metrics/average_precision.yaml: -------------------------------------------------------------------------------- 1 | - name: average_precision 2 | config: 3 | _target_: torchmetrics.AveragePrecision 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/from_lightning_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.video_classification.create_classification_model_from_lightning 2 | checkpoint_path: ??? 
3 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/from_model_zoo_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.video_classification.create_classification_model_from_modelzoo 2 | checkpoint_path: manifold://fair_logging/tree/kalyanv/hub_models/SLOW_8x8_R50.pyth 3 | model: 4 | _target_: pytorchvideo.models.hub.resnet.slow_r50 5 | pretrained: False 6 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/from_ssl_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.ssl_helper.create_classification_model_from_ssl_checkpoint 2 | ssl_checkpoint_path: null 3 | checkpoint_type: simclr 4 | mlp: 5 | _target_: pytorchvideo_trainer.module.byol.create_mlp_util 6 | dim_in: null 7 | dim_out: 400 8 | mlp_dim: 256 9 | num_layers: 1 10 | norm: null 11 | detach_backbone: true 12 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/mvit_base_16x4.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo.models.vision_transformers.create_multiscale_vision_transformers 2 | spatial_size: 224 3 | temporal_size: 16 4 | cls_embed_on: True 5 | sep_pos_embed: True 6 | depth: 16 7 | norm: "layernorm" 8 | input_channels: 3 9 | patch_embed_dim: 96 10 | conv_patch_embed_kernel: [3, 7, 7] 11 | conv_patch_embed_stride: [2, 4, 4] 12 | conv_patch_embed_padding: [1, 3, 3] 13 | enable_patch_embed_norm: False 14 | use_2d_patch: False 15 | # Attention block config. 16 | num_heads: 1 17 | mlp_ratio: 4.0 18 | qkv_bias: True 19 | dropout_rate_block: 0.0 20 | droppath_rate_block: 0.2 21 | pooling_mode: "conv" 22 | pool_first: False 23 | embed_dim_mul: [[1, 2.0], [3, 2.0], [14, 2.0]] 24 | atten_head_mul: [[1, 2.0], [3, 2.0], [14, 2.0]] 25 | pool_q_stride_size: [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]] 26 | pool_kv_stride_size: null 27 | pool_kv_stride_adaptive: [1, 8, 8] 28 | pool_kvq_kernel: [3, 3, 3] 29 | # Head config. 
30 | head_dropout_rate: 0.5 31 | head_activation: null 32 | head_num_classes: 400 33 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/slow_r50.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo.models.resnet.create_resnet 2 | input_channel: 3 3 | model_depth: 50 4 | model_num_class: 400 5 | dropout_rate: 0.5 6 | stem_conv_kernel_size: [1, 7, 7] 7 | head_pool_kernel_size: [8, 7, 7] 8 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/slow_r50_byol.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.byol.create_byol_resnet_50 2 | backbone_embed_dim: 128 3 | mmt: 0.996 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/slow_r50_moco_v2.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.moco_v2.create_moco_resnet_50 2 | backbone_embed_dim: 128 3 | mmt: 0.994 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/slow_r50_simclr.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.module.simclr.create_simclr_resnet_50 2 | backbone_embed_dim: 128 3 | mlp_depth: 1 4 | mlp_inner_dim: 2048 5 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/slowfast_r50.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo.models.slowfast.create_slowfast 2 | input_channels: [3,3] 3 | model_depth: 50 4 | model_num_class: 400 5 | dropout_rate: 0.5 6 | slowfast_fusion_conv_kernel_size: [7, 1, 1] 7 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/model/x3d_xs.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo.models.x3d.create_x3d 2 | input_channel: 3 3 | model_num_class: 400 4 | dropout_rate: 0.5 5 | input_clip_length: 4 6 | input_crop_size: 160 7 | depth_factor: 2.2 8 | head_activation: null 9 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/optim/adam.yaml: -------------------------------------------------------------------------------- 1 | method: 'adam' 2 | lr: 0.001 3 | weight_decay: 0 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/optim/adamw.yaml: -------------------------------------------------------------------------------- 1 | method: 'adamw' 2 | lr: 0.001 3 | weight_decay: 0.01 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/optim/sgd.yaml: -------------------------------------------------------------------------------- 1 | method: 'sgd' 2 | lr: 0.1 3 | weight_decay: 1e-4 4 | momentum: 0.9 5 | nesterov: True 6 | 
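Note: the optimizer groups above (`adam`, `adamw`, `sgd`) are pulled into each train app through its Hydra defaults list (e.g. `- module/optim: sgd`) and can be swapped or tuned via overrides. A minimal sketch, modeled on the compose calls in the unit tests later in this dump; the chosen config name and override values are illustrative only, not prescribed by these files:

```python
# Sketch only: compose the Slow R50 classification app config, but select the
# AdamW optimizer group and override its learning rate (values are illustrative).
from hydra.experimental import compose, initialize_config_module

with initialize_config_module(config_module="pytorchvideo_trainer.conf"):
    cfg = compose(
        config_name="classification_slow_8x8_r50",
        overrides=["module/optim=adamw", "module.optim.lr=1e-3"],
    )
    # cfg.module.optim now reflects adamw.yaml with the overridden lr.
```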
-------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/module/optim/sgd_ssl.yaml: -------------------------------------------------------------------------------- 1 | method: 'sgd' 2 | lr: 0.6 3 | weight_decay: 1e-6 4 | momentum: 0.9 5 | nesterov: True 6 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/simclr_train_app_conf.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorchvideo_trainer.train_app.VideoClassificationTrainApp 2 | 3 | defaults: 4 | - schema/module: simclr_module_conf 5 | - schema/module/optim: optim_conf 6 | - schema/datamodule: ptv_video_classification_data_module_conf 7 | - datamodule/dataloader: kinetics_contrastive 8 | - logger: ptl 9 | - datamodule/transforms: kinetics_moco_v2 10 | - module/knn_memory: kinetics_k400 11 | - module/model: slow_r50_simclr 12 | - module/loss: nt_xent 13 | - module/optim: sgd_ssl 14 | - module/metrics: accuracy 15 | - schema/trainer: trainer 16 | - trainer: cpu 17 | - callbacks: null 18 | - _self_ 19 | trainer: 20 | sync_batchnorm: false # set this to true for training 21 | 22 | hydra: 23 | searchpath: 24 | - pkg://pytorchvideo_trainer.conf 25 | - pkg://torchrecipes.core.conf 26 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/submitit_conf/fair_cluster.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | log_save_dir: null 3 | name: "ptv_trainer_job" 4 | time: "72:00:00" 5 | cpus_per_task: 10 6 | partition: "learnlab" 7 | mem: "470GB" 8 | constraint: "volta32gb" 9 | mode: "prod" 10 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | max_epochs: 1 3 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/trainer/multi_gpu.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | gpus: 8 3 | strategy: ddp 4 | max_epochs: 1 5 | num_sanity_val_steps: 0 6 | log_every_n_steps: 10 7 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/conf/trainer/single_gpu.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | gpus: 1 3 | max_epochs: 1 4 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/datamodule/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .datamodule import PyTorchVideoDataModule # noqa 4 | 5 | 6 | __all__ = [ 7 | "PyTorchVideoDataModule", 8 | ] 9 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/datamodule/collators.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from typing import Any, Callable, Dict, List 4 | 5 | from torch.utils.data._utils.collate import default_collate 6 | 7 | 8 | # pyre-ignore[2] 9 | def multiple_samples_collate(batch: List[Dict[str, List[Any]]]) -> Dict[str, Any]: 10 | """ 11 | Collate function for repeated augmentation. Each instance in the batch has 12 | more than one sample. 13 | 14 | To be used when working with 15 | `pytorchvideo_trainer.datamodule.transforms.RepeatandConverttoList`. 16 | """ 17 | batch_dict = {} 18 | for k in batch[0].keys(): 19 | v_iter = [] 20 | for sample_dict in batch: 21 | v_iter += sample_dict[k] 22 | batch_dict[k] = default_collate(v_iter) 23 | 24 | return batch_dict 25 | 26 | 27 | # pyre-ignore[24] 28 | _COLLATORS: Dict[str, Callable] = { 29 | "multiple_samples_collate": multiple_samples_collate, 30 | } 31 | 32 | 33 | def build_collator_from_name(name: str) -> Callable: # pyre-ignore[24] 34 | """ 35 | A utility function that returns the function handle to a specific collator 36 | in the `_COLLATORS` dictionary based on the queried key. Used in 37 | `pytorchvideo_trainer.datamodule.PyTorchVideoDataModule`, etc. 38 | 39 | Args: 40 | name (str): name of the queried collator. The key should be present in 41 | the `_COLLATORS` dictionary. 42 | """ 43 | assert ( 44 | name in _COLLATORS 45 | ), f"Invalid collator name. Available collators are {_COLLATORS.keys()}" 46 | return _COLLATORS[name] 47 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/pytorchvideo_trainer/module/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .byol import BYOLModule # noqa 4 | from .moco_v2 import MOCOV2Module # noqa 5 | from .simclr import SimCLRModule # noqa 6 | from .video_classification import VideoClassificationModule # noqa 7 | 8 | 9 | __all__ = [ 10 | "VideoClassificationModule", 11 | "SimCLRModule", 12 | "BYOLModule", 13 | "MOCOV2Module", 14 | ] 15 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from setuptools import find_packages, setup 5 | 6 | 7 | setup( 8 | name="pytorchvideo_trainer", 9 | version="0.0.1", 10 | license="Apache 2.0", 11 | author="Facebook AI", 12 | url="https://github.com/facebookresearch/pytorchvideo", 13 | description="PyTorch-Lightning trainer powering PyTorchVideo models.", 14 | python_requires=">=3.8", 15 | install_requires=[ 16 | "submitit", 17 | "pytorchvideo>=0.1.5", 18 | ], 19 | extras_require={ 20 | "test": ["coverage", "pytest", "opencv-python"], 21 | "dev": [ 22 | "opencv-python", 23 | "black==20.8b1", 24 | "sphinx", 25 | "isort==4.3.21", 26 | "flake8==3.8.1", 27 | "flake8-bugbear", 28 | "flake8-comprehensions", 29 | "pre-commit", 30 | "nbconvert", 31 | "bs4", 32 | "autoflake==1.4", 33 | ], 34 | "opencv-python": [ 35 | "opencv-python", 36 | ], 37 | }, 38 | packages=find_packages(exclude=("scripts", "tests")), 39 | ) 40 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/test_conf_datamodule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | from hydra.experimental import compose, initialize_config_module 6 | from hydra.utils import instantiate # @manual 7 | from pytorchvideo_trainer.datamodule.datamodule import PyTorchVideoDataModule 8 | 9 | 10 | class TestKineticsDataModuleConf(unittest.TestCase): 11 | def test_init_with_hydra(self) -> None: 12 | with initialize_config_module(config_module="pytorchvideo_trainer.conf"): 13 | test_conf = compose( 14 | config_name="video_classification_train_app_conf", 15 | overrides=[ 16 | "datamodule/dataloader=kinetics_classification", 17 | "datamodule/transforms=kinetics_classification_slow", 18 | ], 19 | ) 20 | print(test_conf) 21 | kinetics_data_module = instantiate( 22 | test_conf.datamodule, 23 | _recursive_=False, 24 | ) 25 | self.assertIsInstance(kinetics_data_module, PyTorchVideoDataModule) 26 | self.assertIsNotNone(kinetics_data_module.transforms["train"]) 27 | self.assertIsNotNone(kinetics_data_module.transforms["val"]) 28 | self.assertIsNotNone(kinetics_data_module.transforms["test"]) 29 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/test_conf_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import hydra 6 | from hydra.experimental import compose, initialize_config_module 7 | from pytorchvideo_trainer.module.byol import BYOLModule 8 | from pytorchvideo_trainer.module.moco_v2 import MOCOV2Module 9 | from pytorchvideo_trainer.module.simclr import SimCLRModule 10 | from pytorchvideo_trainer.module.video_classification import VideoClassificationModule 11 | 12 | 13 | class TestVideoClassificationModuleConf(unittest.TestCase): 14 | def test_init_with_hydra(self) -> None: 15 | with initialize_config_module(config_module="pytorchvideo_trainer.conf"): 16 | test_conf = compose( 17 | config_name="video_classification_train_app_conf", 18 | overrides=["module/model=slow_r50"], 19 | ) 20 | test_module = hydra.utils.instantiate(test_conf.module, _recursive_=False) 21 | self.assertIsInstance(test_module, VideoClassificationModule) 22 | self.assertIsNotNone(test_module.model) 23 | 24 | 25 | class TestVideoSimCLRModuleConf(unittest.TestCase): 26 | def test_init_with_hydra(self) -> None: 27 | with initialize_config_module(config_module="pytorchvideo_trainer.conf"): 28 | test_conf = compose( 29 | config_name="simclr_train_app_conf", 30 | ) 31 | test_module = hydra.utils.instantiate(test_conf.module, _recursive_=False) 32 | self.assertIsInstance(test_module, SimCLRModule) 33 | self.assertIsNotNone(test_module.model) 34 | 35 | 36 | class TestVideoBYOLModuleConf(unittest.TestCase): 37 | def test_init_with_hydra(self) -> None: 38 | with initialize_config_module(config_module="pytorchvideo_trainer.conf"): 39 | test_conf = compose( 40 | config_name="byol_train_app_conf", 41 | ) 42 | test_module = hydra.utils.instantiate(test_conf.module, _recursive_=False) 43 | self.assertIsInstance(test_module, BYOLModule) 44 | self.assertIsNotNone(test_module.model) 45 | 46 | 47 | class TestVideoMOCOV2ModuleConf(unittest.TestCase): 48 | def test_init_with_hydra(self) -> None: 49 | with 
initialize_config_module(config_module="pytorchvideo_trainer.conf"): 50 | test_conf = compose( 51 | config_name="moco_v2_train_app_conf", 52 | # overrides=["module/model=resnet"], 53 | ) 54 | test_module = hydra.utils.instantiate(test_conf.module, _recursive_=False) 55 | self.assertIsInstance(test_module, MOCOV2Module) 56 | self.assertIsNotNone(test_module.model) 57 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/test_task_byol.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | # pyre-strict 4 | from torchrecipes.core.base_train_app import BaseTrainApp 5 | from vision.fair.pytorchvideo.pytorchvideo_trainer.tests.util import ( 6 | BaseTrainAppTestCase, 7 | create_small_kinetics_dataset, 8 | run_locally, 9 | tempdir, 10 | ) 11 | 12 | 13 | class TestBYOLTrainApp(BaseTrainAppTestCase): 14 | def get_train_app( 15 | self, 16 | root_dir: str, 17 | fast_dev_run: bool = True, 18 | logger: bool = False, 19 | ) -> BaseTrainApp: 20 | create_small_kinetics_dataset(root_dir) 21 | overrides = [ 22 | f"datamodule.dataloader.train.dataset.data_path={root_dir}/train.csv", 23 | f"datamodule.dataloader.val.dataset.data_path={root_dir}/val.csv", 24 | f"datamodule.dataloader.test.dataset.data_path={root_dir}/val.csv", 25 | f"datamodule.dataloader.train.dataset.video_path_prefix={root_dir}", 26 | f"datamodule.dataloader.val.dataset.video_path_prefix={root_dir}", 27 | f"datamodule.dataloader.test.dataset.video_path_prefix={root_dir}", 28 | "datamodule.dataloader.train.num_workers=0", 29 | "datamodule.dataloader.val.num_workers=0", 30 | "datamodule.dataloader.test.num_workers=0", 31 | "module.knn_memory.length=50", 32 | "module.knn_memory.knn_k=2", 33 | "datamodule.dataloader.train.batch_size=2", 34 | "datamodule.dataloader.val.batch_size=2", 35 | "datamodule.dataloader.test.batch_size=2", 36 | "trainer.logger=false", 37 | ] 38 | app = self.create_app_from_hydra( 39 | config_module="pytorchvideo_trainer.conf", 40 | config_name="byol_train_app_conf", 41 | overrides=overrides, 42 | ) 43 | trainer_overrides = {"fast_dev_run": fast_dev_run, "logger": logger} 44 | self.mock_trainer_params(app, trainer_overrides) 45 | return app 46 | 47 | @run_locally 48 | @tempdir 49 | def test_byol_app_train_test_30_views(self, root_dir: str) -> None: 50 | train_app = self.get_train_app( 51 | root_dir=root_dir, fast_dev_run=False, logger=False 52 | ) 53 | output = train_app.train() 54 | self.assertIsNotNone(output) 55 | output = train_app.test() 56 | self.assertIsNotNone(output) 57 | 58 | video_clips_cnts = getattr(train_app.module, "video_clips_cnts", None) 59 | num_ensemble_views = getattr(train_app.datamodule, "num_ensemble_views", 10) 60 | num_spatial_crops = getattr(train_app.datamodule, "num_spatial_crops", 3) 61 | self.assertIsNotNone(video_clips_cnts) 62 | for _, sample_cnts in video_clips_cnts.items(): 63 | self.assertEqual(num_ensemble_views * num_spatial_crops, sample_cnts) 64 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/test_task_moco_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | # pyre-strict 4 | from torchrecipes.core.base_train_app import BaseTrainApp 5 | from vision.fair.pytorchvideo.pytorchvideo_trainer.tests.util import ( 6 | BaseTrainAppTestCase, 7 | create_small_kinetics_dataset, 8 | run_locally, 9 | tempdir, 10 | ) 11 | 12 | 13 | class TestMOCOV2TrainApp(BaseTrainAppTestCase): 14 | def get_train_app( 15 | self, 16 | root_dir: str, 17 | fast_dev_run: bool = True, 18 | logger: bool = False, 19 | ) -> BaseTrainApp: 20 | create_small_kinetics_dataset(root_dir) 21 | overrides = [ 22 | f"datamodule.dataloader.train.dataset.data_path={root_dir}/train.csv", 23 | f"datamodule.dataloader.val.dataset.data_path={root_dir}/val.csv", 24 | f"datamodule.dataloader.test.dataset.data_path={root_dir}/val.csv", 25 | f"datamodule.dataloader.train.dataset.video_path_prefix={root_dir}", 26 | f"datamodule.dataloader.val.dataset.video_path_prefix={root_dir}", 27 | f"datamodule.dataloader.test.dataset.video_path_prefix={root_dir}", 28 | "datamodule.dataloader.train.num_workers=0", 29 | "datamodule.dataloader.val.num_workers=0", 30 | "datamodule.dataloader.test.num_workers=0", 31 | "module.knn_memory.length=50", 32 | "module.knn_memory.knn_k=2", 33 | "datamodule.dataloader.train.batch_size=2", 34 | "datamodule.dataloader.val.batch_size=2", 35 | "datamodule.dataloader.test.batch_size=2", 36 | "trainer.logger=false", 37 | ] 38 | 39 | app = self.create_app_from_hydra( 40 | config_module="pytorchvideo_trainer.conf", 41 | config_name="moco_v2_train_app_conf", 42 | overrides=overrides, 43 | ) 44 | trainer_overrides = {"fast_dev_run": fast_dev_run, "logger": logger} 45 | self.mock_trainer_params(app, trainer_overrides) 46 | return app 47 | 48 | @run_locally 49 | @tempdir 50 | def test_moco_v2_app_train_test_30_views(self, root_dir: str) -> None: 51 | train_app = self.get_train_app( 52 | root_dir=root_dir, fast_dev_run=False, logger=False 53 | ) 54 | output = train_app.train() 55 | self.assertIsNotNone(output) 56 | output = train_app.test() 57 | self.assertIsNotNone(output) 58 | 59 | video_clips_cnts = getattr(train_app.module, "video_clips_cnts", None) 60 | num_ensemble_views = getattr(train_app.datamodule, "num_ensemble_views", 10) 61 | num_spatial_crops = getattr(train_app.datamodule, "num_spatial_crops", 3) 62 | self.assertIsNotNone(video_clips_cnts) 63 | for _, sample_cnts in video_clips_cnts.items(): 64 | self.assertEqual(num_ensemble_views * num_spatial_crops, sample_cnts) 65 | -------------------------------------------------------------------------------- /pytorchvideo_trainer/tests/test_task_simclr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | # pyre-strict 4 | from torchrecipes.core.base_train_app import BaseTrainApp 5 | from vision.fair.pytorchvideo.pytorchvideo_trainer.tests.util import ( 6 | BaseTrainAppTestCase, 7 | create_small_kinetics_dataset, 8 | run_locally, 9 | tempdir, 10 | ) 11 | 12 | 13 | class TestSimCLRTrainApp(BaseTrainAppTestCase): 14 | def get_train_app( 15 | self, 16 | root_dir: str, 17 | fast_dev_run: bool = True, 18 | logger: bool = False, 19 | ) -> BaseTrainApp: 20 | create_small_kinetics_dataset(root_dir) 21 | overrides = [ 22 | f"datamodule.dataloader.train.dataset.data_path={root_dir}/train.csv", 23 | f"datamodule.dataloader.val.dataset.data_path={root_dir}/val.csv", 24 | f"datamodule.dataloader.test.dataset.data_path={root_dir}/val.csv", 25 | f"datamodule.dataloader.train.dataset.video_path_prefix={root_dir}", 26 | f"datamodule.dataloader.val.dataset.video_path_prefix={root_dir}", 27 | f"datamodule.dataloader.test.dataset.video_path_prefix={root_dir}", 28 | "datamodule.dataloader.train.num_workers=0", 29 | "datamodule.dataloader.val.num_workers=0", 30 | "datamodule.dataloader.test.num_workers=0", 31 | "module.knn_memory.length=50", 32 | "module.knn_memory.knn_k=2", 33 | "datamodule.dataloader.train.batch_size=2", 34 | "datamodule.dataloader.val.batch_size=2", 35 | "datamodule.dataloader.test.batch_size=2", 36 | "trainer.logger=false", 37 | ] 38 | app = self.create_app_from_hydra( 39 | config_module="pytorchvideo_trainer.conf", 40 | config_name="simclr_train_app_conf", 41 | overrides=overrides, 42 | ) 43 | trainer_overrides = {"fast_dev_run": fast_dev_run, "logger": logger} 44 | self.mock_trainer_params(app, trainer_overrides) 45 | return app 46 | 47 | @run_locally 48 | @tempdir 49 | def test_simclr_app_train_test_30_views(self, root_dir: str) -> None: 50 | train_app = self.get_train_app( 51 | root_dir=root_dir, fast_dev_run=False, logger=False 52 | ) 53 | output = train_app.train() 54 | self.assertIsNotNone(output) 55 | output = train_app.test() 56 | self.assertIsNotNone(output) 57 | 58 | video_clips_cnts = getattr(train_app.module, "video_clips_cnts", None) 59 | num_ensemble_views = getattr(train_app.datamodule, "num_ensemble_views", 10) 60 | num_spatial_crops = getattr(train_app.datamodule, "num_spatial_crops", 3) 61 | self.assertIsNotNone(video_clips_cnts) 62 | for _, sample_cnts in video_clips_cnts.items(): 63 | self.assertEqual(num_ensemble_views * num_spatial_crops, sample_cnts) 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 88 3 | multi_line_output = 3 4 | include_trailing_comma = True 5 | force_grid_wrap = 0 6 | default_section = THIRDPARTY 7 | lines_after_imports = 2 8 | combine_as_imports = True 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 4 | import os 5 | 6 | from setuptools import find_packages, setup 7 | 8 | 9 | def get_version(): 10 | init_py_path = os.path.join( 11 | os.path.abspath(os.path.dirname(__file__)), "pytorchvideo", "__init__.py" 12 | ) 13 | init_py = open(init_py_path, "r").readlines() 14 | version_line = [ 15 | lines.strip() for lines in init_py if lines.startswith("__version__") 16 | ][0] 17 | version = version_line.split("=")[-1].strip().strip("'\"") 18 | 19 | # Used by CI to build nightly packages. Users should never use it. 20 | # To build a nightly wheel, run: 21 | # BUILD_NIGHTLY=1 python setup.py sdist 22 | if os.getenv("BUILD_NIGHTLY", "0") == "1": 23 | from datetime import datetime 24 | 25 | date_str = datetime.today().strftime("%Y%m%d") 26 | # pip can perform proper comparison for ".post" suffix, 27 | # i.e., "1.1.post1234" >= "1.1" 28 | version = version + ".post" + date_str 29 | 30 | new_init_py = [l for l in init_py if not l.startswith("__version__")] 31 | new_init_py.append('__version__ = "{}"\n'.format(version)) 32 | with open(init_py_path, "w") as f: 33 | f.write("".join(new_init_py)) 34 | 35 | return version 36 | 37 | 38 | def get_name(): 39 | name = "pytorchvideo" 40 | if os.getenv("BUILD_NIGHTLY", "0") == "1": 41 | name += "-nightly" 42 | return name 43 | 44 | 45 | setup( 46 | name=get_name(), 47 | version=get_version(), 48 | license="Apache 2.0", 49 | author="Facebook AI", 50 | url="https://github.com/facebookresearch/pytorchvideo", 51 | description="A video understanding deep learning library.", 52 | python_requires=">=3.7", 53 | install_requires=[ 54 | "fvcore", 55 | "av", 56 | "parameterized", 57 | "iopath", 58 | "networkx", 59 | ], 60 | extras_require={ 61 | "test": ["coverage", "pytest", "opencv-python", "decord"], 62 | "dev": [ 63 | "opencv-python", 64 | "decord", 65 | "black==20.8b1", 66 | "sphinx", 67 | "isort==4.3.21", 68 | "flake8==3.8.1", 69 | "flake8-bugbear", 70 | "flake8-comprehensions", 71 | "pre-commit", 72 | "nbconvert", 73 | "bs4", 74 | "autoflake==1.4", 75 | ], 76 | "opencv-python": [ 77 | "opencv-python", 78 | ], 79 | }, 80 | packages=find_packages(exclude=("scripts", "tests")), 81 | ) 82 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Unit Tests 2 | 3 | 4 | Before running the tests, please ensure that you have installed the necessary additional test dependencies. 5 | If not installed, check the [install-README](https://github.com/facebookresearch/pytorchvideo/blob/main/INSTALL.md) on how to do it. 6 | 7 | Use the following command to run the tests: 8 | ``` 9 | # From root of the project 10 | python -m unittest discover -v -s ./tests 11 | ``` 12 | 13 | To generate the coverage reports, please run the following command: 14 | ``` 15 | # Install Coverage using 16 | pip install coverage 17 | 18 | # From root of the project 19 | coverage run -m unittest discover -v -s tests 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /tests/benchmark_transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | 3 | import unittest 4 | from typing import Callable 5 | 6 | import torch 7 | from fvcore.common.benchmark import benchmark 8 | from pytorchvideo.data.utils import thwc_to_cthw 9 | from pytorchvideo.transforms.functional import short_side_scale 10 | from utils import create_dummy_video_frames 11 | 12 | 13 | class TestBenchmarkTransforms(unittest.TestCase): 14 | def setUp(self): 15 | super().setUp() 16 | torch.set_rng_state(torch.manual_seed(42).get_state()) 17 | 18 | def test_benchmark_short_side_scale_pytorch(self, num_iters: int = 10) -> None: 19 | """ 20 | Benchmark scale operation with pytorch backend. 21 | Args: 22 | num_iters (int): number of iterations to perform benchmarking. 23 | """ 24 | kwargs_list = [ 25 | {"temporal_size": 8, "ori_spatial_size": (128, 128), "dst_short_size": 112}, 26 | { 27 | "temporal_size": 16, 28 | "ori_spatial_size": (128, 128), 29 | "dst_short_size": 112, 30 | }, 31 | { 32 | "temporal_size": 32, 33 | "ori_spatial_size": (128, 128), 34 | "dst_short_size": 112, 35 | }, 36 | {"temporal_size": 8, "ori_spatial_size": (256, 256), "dst_short_size": 224}, 37 | { 38 | "temporal_size": 16, 39 | "ori_spatial_size": (256, 256), 40 | "dst_short_size": 224, 41 | }, 42 | { 43 | "temporal_size": 32, 44 | "ori_spatial_size": (256, 256), 45 | "dst_short_size": 224, 46 | }, 47 | {"temporal_size": 8, "ori_spatial_size": (320, 320), "dst_short_size": 224}, 48 | { 49 | "temporal_size": 16, 50 | "ori_spatial_size": (320, 320), 51 | "dst_short_size": 224, 52 | }, 53 | { 54 | "temporal_size": 32, 55 | "ori_spatial_size": (320, 320), 56 | "dst_short_size": 224, 57 | }, 58 | ] 59 | 60 | def _init_benchmark_short_side_scale(**kwargs) -> Callable: 61 | x = thwc_to_cthw( 62 | create_dummy_video_frames( 63 | kwargs["temporal_size"], 64 | kwargs["ori_spatial_size"][0], 65 | kwargs["ori_spatial_size"][1], 66 | ) 67 | ).to(dtype=torch.float32) 68 | 69 | def func_to_benchmark() -> None: 70 | _ = short_side_scale(x, kwargs["dst_short_size"]) 71 | return 72 | 73 | return func_to_benchmark 74 | 75 | benchmark( 76 | _init_benchmark_short_side_scale, 77 | "benchmark_short_side_scale_pytorch", 78 | kwargs_list, 79 | num_iters=num_iters, 80 | warmup_iters=2, 81 | ) 82 | self.assertTrue(True) 83 | -------------------------------------------------------------------------------- /tests/test_accelerator_efficient_blocks_mobile_cpu_activation_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | import logging 4 | import unittest 5 | from copy import deepcopy 6 | 7 | import torch 8 | from pytorchvideo.layers.accelerator.mobile_cpu.activation_functions import ( 9 | supported_act_functions, 10 | ) 11 | from pytorchvideo.layers.accelerator.mobile_cpu.attention import SqueezeExcitation 12 | 13 | 14 | class TestActivationAttentionEquivalency(unittest.TestCase): 15 | def test_activation_equivalency(self): 16 | # Input tensor 17 | input_tensor = torch.randn(1, 3, 4, 6, 6) 18 | for iter_activation_name in supported_act_functions: 19 | act_func_ref = supported_act_functions[iter_activation_name]() 20 | act_func_convert = deepcopy(act_func_ref) 21 | act_func_convert.convert() 22 | # Get output of both activations 23 | out0 = act_func_ref(input_tensor) 24 | out1 = act_func_convert(input_tensor) 25 | # Check arithmetic equivalency 26 | max_err = float(torch.max(torch.abs(out0 - out1))) 27 | 28 | logging.info( 29 | f"test_activation_equivalency: {iter_activation_name} max_err {max_err}" 30 | ) 31 | self.assertTrue(max_err < 1e-3) 32 | 33 | def test_squeeze_excite_equivalency(self): 34 | # Input tensor 35 | input_tensor = torch.randn(1, 16, 4, 6, 6) 36 | # Instantiate ref and convert se modules. 37 | se_ref = SqueezeExcitation(16, num_channels_reduced=2, is_3d=True) 38 | se_ref.eval() 39 | se_convert = deepcopy(se_ref) 40 | se_convert.convert((1, 16, 4, 6, 6)) 41 | # Get output of both activations 42 | out0 = se_ref(input_tensor) 43 | out1 = se_convert(input_tensor) 44 | # Check arithmetic equivalency 45 | max_err = float(torch.max(torch.abs(out0 - out1))) 46 | rel_err = torch.abs((out0 - out1) / out0) 47 | max_rel_err = float(torch.max(rel_err)) 48 | 49 | logging.info( 50 | ( 51 | "test_squeeze_excite_equivalency: " 52 | f"max_err {max_err}, max_rel_err {max_rel_err}" 53 | ) 54 | ) 55 | self.assertTrue(max_err < 1e-3) 56 | -------------------------------------------------------------------------------- /tests/test_accelerator_efficient_blocks_mobile_cpu_head_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
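# --- Illustrative sketch (added note, not part of the original test) ---
# The test below checks a common classification head built from accelerator
# blocks: global average pooling followed by a fully connected projection.
# A minimal version of that head, converted for deployment, looks like this;
# the input size (1, 3, 4, 6, 6) matches the 5D case used in the test.
def _example_efficient_head():
    import torch
    from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected
    from pytorchvideo.layers.accelerator.mobile_cpu.pool import AdaptiveAvgPool3dOutSize1

    pool = AdaptiveAvgPool3dOutSize1()
    fc = FullyConnected(3, 8)

    pool.convert((1, 3, 4, 6, 6))  # fixed input size enables the optimized pooling path
    fc.convert()

    x = torch.randn(1, 3, 4, 6, 6)
    out = fc(pool(x).permute((0, 2, 3, 4, 1)))  # channels-last before the linear layer
    return out.shape  # (1, 1, 1, 1, 8)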
2 | 3 | import logging 4 | import unittest 5 | from copy import deepcopy 6 | 7 | import torch 8 | from pytorchvideo.layers.accelerator.mobile_cpu.fully_connected import FullyConnected 9 | from pytorchvideo.layers.accelerator.mobile_cpu.pool import ( 10 | AdaptiveAvgPool2d, 11 | AdaptiveAvgPool2dOutSize1, 12 | AdaptiveAvgPool3d, 13 | AdaptiveAvgPool3dOutSize1, 14 | ) 15 | 16 | 17 | class TestHeadLayerEquivalency(unittest.TestCase): 18 | def test_head_layer_equivalency(self): 19 | for input_dim in (4, 5): # 4 for BCHW, 5 for BCTHW 20 | input_tensor_size = (1, 3, 4, 6, 6) if input_dim == 5 else (1, 3, 6, 6) 21 | input_tensor = torch.randn(input_tensor_size) 22 | # Build up common head layer: pool + linear 23 | if input_dim == 5: 24 | pool_efficient_block_ref = AdaptiveAvgPool3d(1) 25 | pool_efficient_block_1 = AdaptiveAvgPool3d(1) 26 | pool_efficient_block_2 = AdaptiveAvgPool3dOutSize1() 27 | 28 | else: 29 | pool_efficient_block_ref = AdaptiveAvgPool2d(1) 30 | pool_efficient_block_1 = AdaptiveAvgPool2d(1) 31 | pool_efficient_block_2 = AdaptiveAvgPool2dOutSize1() 32 | pool_efficient_block_1.convert() 33 | pool_efficient_block_2.convert(input_tensor_size) 34 | linear_ref = FullyConnected(3, 8) 35 | linear_1 = deepcopy(linear_ref) 36 | linear_1.convert() 37 | 38 | ref_out = pool_efficient_block_ref(input_tensor) 39 | if input_dim == 5: 40 | ref_out = ref_out.permute((0, 2, 3, 4, 1)) 41 | else: 42 | ref_out = ref_out.permute((0, 2, 3, 1)) 43 | ref_out = linear_ref(ref_out) 44 | 45 | head_out_1 = pool_efficient_block_1(input_tensor) 46 | if input_dim == 5: 47 | head_out_1 = head_out_1.permute((0, 2, 3, 4, 1)) 48 | else: 49 | head_out_1 = head_out_1.permute((0, 2, 3, 1)) 50 | head_out_1 = linear_1(head_out_1) 51 | # Check arithmetic equivalency 52 | max_err = float(torch.max(torch.abs(ref_out - head_out_1))) 53 | rel_err = torch.abs((ref_out - head_out_1) / ref_out) 54 | max_rel_err = float(torch.max(rel_err)) 55 | logging.info( 56 | ( 57 | "test_head_layer_equivalency: AdaptiveAvgPool + Linear" 58 | f"input tensor size: {input_tensor_size}" 59 | f"max_err {max_err}, max_rel_err {max_rel_err}" 60 | ) 61 | ) 62 | self.assertTrue(max_err < 1e-3) 63 | 64 | head_out_2 = pool_efficient_block_2(input_tensor) 65 | if input_dim == 5: 66 | head_out_2 = head_out_2.permute((0, 2, 3, 4, 1)) 67 | else: 68 | head_out_2 = head_out_2.permute((0, 2, 3, 1)) 69 | head_out_2 = linear_1(head_out_2) 70 | # Check arithmetic equivalency 71 | max_err = float(torch.max(torch.abs(ref_out - head_out_2))) 72 | rel_err = torch.abs((ref_out - head_out_2) / ref_out) 73 | max_rel_err = float(torch.max(rel_err)) 74 | logging.info( 75 | ( 76 | "test_head_layer_equivalency: AdaptiveAvgPoolOutSize1 + Linear" 77 | f"input tensor size: {input_tensor_size}" 78 | f"max_err {max_err}, max_rel_err {max_rel_err}" 79 | ) 80 | ) 81 | self.assertTrue(max_err < 1e-3) 82 | -------------------------------------------------------------------------------- /tests/test_accelerator_efficient_blocks_mobile_cpu_residual_block.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
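# --- Illustrative sketch (added note, not part of the original test) ---
# The test below sweeps many configurations; the underlying deployment flow for
# a single X3D bottleneck block is simply: build, switch to eval, deep-copy,
# convert the copy with a fixed input blob size, and compare outputs.
def _example_convert_x3d_bottleneck_block():
    from copy import deepcopy

    import torch
    from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
        X3dBottleneckBlock,
    )

    block = X3dBottleneckBlock(
        3, 16, 3, use_residual=True, spatial_stride=1, se_ratio=0.5
    )
    block.eval()  # equivalency is checked in eval mode

    deploy_block = deepcopy(block)
    deploy_block.convert((1, 3, 4, 6, 6))  # input blob size is fixed at convert time

    x = torch.randn(1, 3, 4, 6, 6)
    assert float(torch.max(torch.abs(block(x) - deploy_block(x)))) < 1e-3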
2 | 3 | import logging 4 | import unittest 5 | from copy import deepcopy 6 | 7 | import torch 8 | from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import ( 9 | X3dBottleneckBlock, 10 | ) 11 | 12 | 13 | class TestConv3dBlockEquivalency(unittest.TestCase): 14 | def test_X3dBottleneckBlock_equivalency(self): 15 | # Input tensor 16 | input_blob_size = (1, 3, 4, 6, 6) 17 | input_tensor = torch.randn(input_blob_size) 18 | for use_residual in (True, False): 19 | for spatial_stride in (1, 2): 20 | for se_ratio in (0, 0.5): 21 | for act_func_0 in ("relu", "swish", "hswish", "identity"): 22 | for act_func_1 in ("relu", "swish", "hswish", "identity"): 23 | for act_func_2 in ("relu", "swish", "hswish", "identity"): 24 | act_func_tuple = (act_func_0, act_func_1, act_func_2) 25 | # X3dBottleneckBlock 26 | x3d_block_ref = X3dBottleneckBlock( 27 | 3, 28 | 16, 29 | 3, 30 | use_residual=use_residual, 31 | spatial_stride=spatial_stride, 32 | se_ratio=se_ratio, 33 | act_functions=act_func_tuple, 34 | ) 35 | x3d_block = deepcopy(x3d_block_ref) 36 | # Get ref output 37 | x3d_block_ref.eval() 38 | out_ref = x3d_block_ref(input_tensor) 39 | # Convert into deployment mode 40 | x3d_block.convert(input_blob_size) 41 | out = x3d_block(input_tensor) 42 | # Check arithmetic equivalency 43 | max_err = float(torch.max(torch.abs(out_ref - out))) 44 | rel_err = torch.abs((out_ref - out) / out_ref) 45 | max_rel_err = float(torch.max(rel_err)) 46 | logging.info( 47 | ( 48 | "test_X3dBottleneckBlock_equivalency: " 49 | f"current setting: use_residual {use_residual}, " 50 | f"spatial_stride {spatial_stride}, " 51 | f"se_ratio {se_ratio}, " 52 | f"act_func_tuple {act_func_tuple}, " 53 | f"max_err {max_err}, max_rel_err {max_rel_err}" 54 | ) 55 | ) 56 | self.assertTrue(max_err < 1e-3) 57 | -------------------------------------------------------------------------------- /tests/test_data_frame_video.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import pytest 6 | from pytorchvideo.data.frame_video import FrameVideo 7 | from utils import temp_frame_video 8 | 9 | 10 | class TestFrameVideo(unittest.TestCase): 11 | def test_frame_video_works(self): 12 | frame_names = [f"{str(i)}.png" for i in range(3)] 13 | with temp_frame_video(frame_names) as (f_name, data): 14 | frame_paths = [f_name / x for x in frame_names] 15 | test_video = FrameVideo.from_frame_paths(frame_paths) 16 | expected_duration = ( 17 | 0.1 # Total duration of 3 frames at 30fps is 0.1 seconds. 
18 | ) 19 | self.assertEqual(test_video.duration, expected_duration) 20 | 21 | # All frames (0 - 0.1 seconds) 22 | clip = test_video.get_clip(0, 0.1) 23 | frames, indices = clip["video"], clip["frame_indices"] 24 | self.assertTrue(frames.equal(data)) 25 | self.assertEqual(indices, [0, 1, 2]) 26 | 27 | # All frames (0 - 0.1 seconds), filtred to middle frame 28 | clip = test_video.get_clip(0, 0.1, lambda lst: lst[1:2]) 29 | frames, indices = clip["video"], clip["frame_indices"] 30 | self.assertTrue(frames.equal(data[:, 1:2])) 31 | self.assertEqual(indices, [1]) 32 | 33 | # 2 frames (0 - 0.066 seconds) 34 | clip = test_video.get_clip(0, 0.066) 35 | frames, indices = clip["video"], clip["frame_indices"] 36 | self.assertTrue(frames.equal(data[:, :2])) 37 | self.assertEqual(indices, [0, 1]) 38 | 39 | # No frames (3 - 5 seconds) 40 | result = test_video.get_clip(3, 5) 41 | self.assertEqual(result, None) 42 | 43 | def test_open_video_failure(self): 44 | test_video = FrameVideo.from_frame_paths(["non_existent_file.txt"]) 45 | with pytest.raises(Exception): 46 | test_video.get_clip(0, 0.01) # duration is 1 / 30 because one frame 47 | 48 | def test_empty_frames_failure(self): 49 | with pytest.raises(AssertionError): 50 | FrameVideo.from_frame_paths([]) 51 | -------------------------------------------------------------------------------- /tests/test_fuse_bn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.models.vision_transformers import ( 7 | create_multiscale_vision_transformers, 8 | ) 9 | 10 | 11 | class TestFuseBN(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | torch.set_rng_state(torch.manual_seed(42).get_state()) 15 | 16 | def test_fuse_bn(self): 17 | model = create_multiscale_vision_transformers( 18 | spatial_size=224, 19 | temporal_size=8, 20 | norm="batchnorm", 21 | embed_dim_mul=[[1, 2.0], [3, 2.0], [14, 2.0]], 22 | atten_head_mul=[[1, 2.0], [3, 2.0], [14, 2.0]], 23 | pool_q_stride_size=[[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]], 24 | pool_kv_stride_adaptive=[1, 8, 8], 25 | pool_kvq_kernel=[3, 3, 3], 26 | cls_embed_on=False, 27 | ) 28 | 29 | for blk in model.blocks: 30 | blk.norm1 = rand_init_bn(blk.norm1) 31 | blk.norm2 = rand_init_bn(blk.norm2) 32 | if blk.attn.norm_q: 33 | blk.attn.norm_q = rand_init_bn(blk.attn.norm_q) 34 | if blk.attn.norm_k: 35 | blk.attn.norm_k = rand_init_bn(blk.attn.norm_k) 36 | if blk.attn.norm_v: 37 | blk.attn.norm_v = rand_init_bn(blk.attn.norm_v) 38 | 39 | model.eval() 40 | 41 | x = torch.randn((4, 3, 8, 224, 224)) 42 | expected_output = model(x) 43 | model.fuse_bn() 44 | output = model(x) 45 | self.assertTrue(torch.all(torch.isclose(output, expected_output, atol=1e-5))) 46 | self.assertTrue( 47 | len( 48 | [ 49 | layer 50 | for layer in model.modules() 51 | if isinstance(layer, (torch.nn.BatchNorm1d, torch.nn.BatchNorm3d)) 52 | ] 53 | ) 54 | == 0 55 | ) 56 | 57 | 58 | def rand_init_bn(bn): 59 | bn.weight.data.uniform_(0.5, 1.5) 60 | bn.bias.data.uniform_(-0.5, 0.5) 61 | bn.running_var.data.uniform_(0.5, 1.5) 62 | bn.running_mean.data.uniform_(-0.5, 0.5) 63 | return bn 64 | -------------------------------------------------------------------------------- /tests/test_layers_drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
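# --- Illustrative sketch (added note, not part of the original test) ---
# DropPath implements stochastic depth: in training mode it zeroes the residual
# branch for a random subset of samples (with probability drop_prob) and
# rescales the rest. The residual wiring shown here is the conventional usage
# pattern, assumed for illustration rather than taken from this test.
def _example_drop_path_residual_block():
    import torch
    from torch import nn
    from pytorchvideo.layers import DropPath

    class ResidualMLPBlock(nn.Module):
        def __init__(self, dim: int, drop_prob: float = 0.2) -> None:
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
            self.drop_path = DropPath(drop_prob=drop_prob)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # The branch output, not the identity path, is subject to dropping.
            return x + self.drop_path(self.mlp(x))

    block = ResidualMLPBlock(dim=20)
    return block(torch.rand(64, 10, 20)).shape  # same shape as the input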
2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.layers import DropPath 7 | 8 | 9 | class TestDropPath(unittest.TestCase): 10 | def setUp(self): 11 | super().setUp() 12 | torch.set_rng_state(torch.manual_seed(42).get_state()) 13 | 14 | def test_dropPath(self): 15 | # Input should be same if drop_prob = 0. 16 | net_drop_path = DropPath(drop_prob=0.0) 17 | fake_input = torch.rand(64, 10, 20) 18 | output = net_drop_path(fake_input) 19 | self.assertTrue(output.equal(fake_input)) 20 | # Test when drop_prob > 0. 21 | net_drop_path = DropPath(drop_prob=0.5) 22 | fake_input = torch.rand(64, 10, 20) 23 | output = net_drop_path(fake_input) 24 | self.assertTrue(output.shape, fake_input.shape) 25 | -------------------------------------------------------------------------------- /tests/test_layers_fusion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.layers import make_fusion_layer 7 | 8 | 9 | class TestFusion(unittest.TestCase): 10 | def setUp(self): 11 | super().setUp() 12 | torch.set_rng_state(torch.manual_seed(42).get_state()) 13 | 14 | self.fake_input_1 = torch.Tensor( 15 | [[[4, -2], [3, 0]], [[0, 2], [4, 3]], [[3, 1], [5, 2]]] 16 | ).float() 17 | self.fake_input_2 = torch.Tensor( 18 | [[[1, 2], [3, 4]], [[5, 6], [6, 5]], [[4, 3], [2, 1]]] 19 | ).float() 20 | 21 | def test_reduce_fusion_layers(self): 22 | expected_output_for_method = { 23 | "max": torch.Tensor( 24 | [[[4, 2], [3, 4]], [[5, 6], [6, 5]], [[4, 3], [5, 2]]] 25 | ).float(), 26 | "sum": torch.Tensor( 27 | [[[5, 0], [6, 4]], [[5, 8], [10, 8]], [[7, 4], [7, 3]]] 28 | ).float(), 29 | "prod": torch.Tensor( 30 | [[[4, -4], [9, 0]], [[0, 12], [24, 15]], [[12, 3], [10, 2]]] 31 | ).float(), 32 | } 33 | 34 | for method, expected_output in expected_output_for_method.items(): 35 | model = make_fusion_layer( 36 | method, [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]] 37 | ) 38 | output = model([self.fake_input_1, self.fake_input_2]) 39 | self.assertTrue(torch.equal(output, expected_output)) 40 | self.assertEqual(model.output_dim, self.fake_input_1.shape[-1]) 41 | 42 | def test_concat_fusion(self): 43 | model = make_fusion_layer( 44 | "concat", [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]] 45 | ) 46 | input_list = [self.fake_input_1, self.fake_input_2] 47 | output = model(input_list) 48 | expected_output = torch.cat(input_list, dim=-1) 49 | self.assertTrue(torch.equal(output, expected_output)) 50 | 51 | expected_shape = self.fake_input_1.shape[-1] + self.fake_input_2.shape[-1] 52 | self.assertEqual(model.output_dim, expected_shape) 53 | 54 | def test_temporal_concat_fusion(self): 55 | model = make_fusion_layer( 56 | "temporal_concat", 57 | [self.fake_input_1.shape[-1], self.fake_input_2.shape[-1]], 58 | ) 59 | input_list = [self.fake_input_1, self.fake_input_2] 60 | output = model(input_list) 61 | 62 | expected_output = torch.cat(input_list, dim=-2) 63 | self.assertTrue(torch.equal(output, expected_output)) 64 | self.assertEqual(model.output_dim, self.fake_input_2.shape[-1]) 65 | -------------------------------------------------------------------------------- /tests/test_layers_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
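# --- Illustrative sketch (added note, not part of the original test) ---
# `make_multilayer_perceptron` builds an MLP from a list of layer widths (the
# first entry is the input dimension) and returns both the module and its
# output dimension. The arguments below are a trimmed version of the
# combinations swept in the test that follows.
def _example_make_multilayer_perceptron():
    import torch
    from torch import nn
    from pytorchvideo.layers import make_multilayer_perceptron

    mlp, output_dim = make_multilayer_perceptron(
        fully_connected_dims=[64, 128, 32],
        mid_activation=nn.ReLU,
        final_activation=None,
        norm=None,
    )
    out = mlp(torch.rand(8, 64))
    return out.shape, output_dim  # (torch.Size([8, 32]), 32)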
2 | 3 | import itertools 4 | import unittest 5 | 6 | import torch 7 | import torch.nn as nn 8 | from pytorchvideo.layers import make_multilayer_perceptron 9 | 10 | 11 | class TestMLP(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | torch.set_rng_state(torch.manual_seed(42).get_state()) 15 | 16 | def test_make_multilayer_perceptron(self): 17 | fake_input = torch.rand((8, 64)) 18 | fcs = [64, 128, 64, 32] 19 | mid_activations = [nn.ReLU, nn.Sigmoid] 20 | final_activations = [nn.ReLU, nn.Sigmoid, None] 21 | norms = [nn.LayerNorm, nn.BatchNorm1d, None] 22 | for mid_act, final_act, norm in itertools.product( 23 | mid_activations, final_activations, norms 24 | ): 25 | mlp, output_dim = make_multilayer_perceptron( 26 | fully_connected_dims=fcs, 27 | mid_activation=mid_act, 28 | final_activation=final_act, 29 | norm=norm, 30 | dropout_rate=0.5, 31 | ) 32 | 33 | self.assertEqual(output_dim, 32) 34 | 35 | output = mlp(fake_input) 36 | self.assertTrue(output.shape, torch.Size([8, 32])) 37 | -------------------------------------------------------------------------------- /tests/test_layers_squeeze_excitation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import copy 4 | import unittest 5 | 6 | import torch 7 | import torch.nn as nn 8 | from pytorchvideo.layers.squeeze_excitation import ( 9 | create_audio_2d_squeeze_excitation_block, 10 | ) 11 | 12 | 13 | class Test2DSqueezeExcitationBlock(unittest.TestCase): 14 | def setUp(self): 15 | self.layer_args = { 16 | "dim_in": 32, 17 | "dim_out": 32, 18 | "use_se": True, 19 | "se_reduction_ratio": 16, 20 | "branch_fusion": lambda x, y: x + y, 21 | "conv_a_kernel_size": 3, 22 | "conv_a_stride": 1, 23 | "conv_a_padding": 1, 24 | "conv_b_kernel_size": 3, 25 | "conv_b_stride": 1, 26 | "conv_b_padding": 1, 27 | "norm": nn.BatchNorm2d, 28 | "norm_eps": 1e-5, 29 | "norm_momentum": 0.1, 30 | "activation": nn.ReLU, 31 | } 32 | 33 | self.batchsize = 1 34 | self.forward_pass_configs = [ 35 | { 36 | "input": torch.rand(self.batchsize, self.layer_args["dim_in"], 100, 40), 37 | "output_shape": torch.Size( 38 | [self.batchsize, self.layer_args["dim_out"], 100, 40] 39 | ), 40 | }, 41 | ] 42 | 43 | def test_forward_pass(self): 44 | for split_config in self.forward_pass_configs: 45 | layer_args = copy.deepcopy(self.layer_args) 46 | model = create_audio_2d_squeeze_excitation_block(**layer_args) 47 | 48 | out = model(split_config["input"]) 49 | self.assertTrue(isinstance(out, torch.Tensor)) 50 | self.assertEqual(out.size(), split_config["output_shape"]) 51 | -------------------------------------------------------------------------------- /tests/test_losses_soft_target_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import itertools 4 | import unittest 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from pytorchvideo.losses.soft_target_cross_entropy import SoftTargetCrossEntropyLoss 9 | 10 | 11 | class TestSoftTargetCrossEntropyLoss(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | torch.set_rng_state(torch.manual_seed(42).get_state()) 15 | 16 | def test_soft_target_cross_entropy_loss(self): 17 | """ 18 | Test the soft target cross entropy loss. 
19 | """ 20 | for batch_size, num_class, use_1D_target in itertools.product( 21 | (1, 8), (2, 10), (True, False) 22 | ): 23 | loss = SoftTargetCrossEntropyLoss() 24 | 25 | # Test forwarding. 26 | for ( 27 | input_tensor, 28 | target_tensor, 29 | ) in TestSoftTargetCrossEntropyLoss._get_inputs( 30 | batch_size=batch_size, num_class=num_class, use_1D_target=use_1D_target 31 | ): 32 | output_tensor = loss(input_tensor, target_tensor) 33 | output_shape = output_tensor.shape 34 | 35 | self.assertEqual( 36 | output_shape, 37 | torch.Size([]), 38 | "Output shape {} is different from expected.".format(output_shape), 39 | ) 40 | 41 | # If target is normalized, output_tensor must match direct eval 42 | if target_tensor.ndim == 1 or all(target_tensor.sum(dim=-1) == 1): 43 | _target_tensor = target_tensor 44 | if target_tensor.ndim == 1: 45 | _target_tensor = torch.nn.functional.one_hot( 46 | target_tensor, num_class 47 | ) 48 | 49 | _output_tensor = torch.sum( 50 | -_target_tensor * F.log_softmax(input_tensor, dim=-1), dim=-1 51 | ).mean() 52 | 53 | self.assertTrue(abs(_output_tensor - output_tensor) < 1e-6) 54 | 55 | @staticmethod 56 | def _get_inputs( 57 | batch_size: int = 16, num_class: int = 400, use_1D_target: bool = True 58 | ) -> torch.tensor: 59 | """ 60 | Provide different tensors as test cases. 61 | 62 | Yield: 63 | (torch.tensor): tensor as test case input. 64 | """ 65 | # Prepare random tensor as test cases. 66 | if use_1D_target: 67 | target_shape = (batch_size,) 68 | else: 69 | target_shape = (batch_size, num_class) 70 | input_shape = (batch_size, num_class) 71 | 72 | yield torch.rand(input_shape), torch.randint(num_class, target_shape) 73 | -------------------------------------------------------------------------------- /tests/test_models_audio_visual_slowfast.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import itertools 4 | import unittest 5 | from typing import Tuple 6 | 7 | import torch 8 | from pytorchvideo.models.audio_visual_slowfast import create_audio_visual_slowfast 9 | from pytorchvideo.transforms.functional import uniform_temporal_subsample_repeated 10 | from torch import nn 11 | 12 | 13 | class TestAVSlowFast(unittest.TestCase): 14 | def setUp(self): 15 | super().setUp() 16 | torch.set_rng_state(torch.manual_seed(42).get_state()) 17 | 18 | def test_create_avslowfast_with_callable(self): 19 | """ 20 | Test builder `create_audio_visual_slowfast` with callable inputs. 21 | """ 22 | for norm, activation in itertools.product( 23 | (nn.BatchNorm3d, None), (nn.ReLU, nn.Sigmoid, None) 24 | ): 25 | input_channel = 3 26 | 27 | model = create_audio_visual_slowfast( 28 | input_channels=(input_channel, input_channel, 1), 29 | model_depth=18, 30 | norm=norm, 31 | activation=activation, 32 | ) 33 | 34 | # Test forwarding. 35 | for tensor in TestAVSlowFast._get_inputs(input_channel): 36 | with torch.no_grad(): 37 | if tensor[0].shape[1] != input_channel: 38 | with self.assertRaises(RuntimeError): 39 | model(tensor) 40 | continue 41 | 42 | model(tensor) 43 | 44 | @staticmethod 45 | def _get_inputs( 46 | channel: int = 3, 47 | clip_length: int = 64, 48 | audio_clip_length: int = 128, 49 | crop_size: int = 224, 50 | audio_size: int = 80, 51 | frame_ratios: Tuple[int] = (8, 2), 52 | audio_frame_ratio: int = 1, 53 | ) -> Tuple[torch.Tensor]: 54 | """ 55 | Provide different tensors as test cases. 56 | 57 | Yield: 58 | Tuple[torch.Tensor]: tensors as test case input. 
59 | """ 60 | # Prepare random inputs as test cases. 61 | shape = (1, channel, clip_length, crop_size, crop_size) 62 | audio_shape = (1, 1, audio_clip_length, 1, audio_size) 63 | output = uniform_temporal_subsample_repeated( 64 | torch.rand(shape), frame_ratios=frame_ratios, temporal_dim=2 65 | ) 66 | yield output + uniform_temporal_subsample_repeated( 67 | torch.rand(audio_shape), frame_ratios=(audio_frame_ratio,), temporal_dim=2 68 | ) 69 | -------------------------------------------------------------------------------- /tests/test_models_byol.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.models.byol import BYOL 7 | from torch import nn 8 | 9 | 10 | class TestBYOL(unittest.TestCase): 11 | def setUp(self): 12 | super().setUp() 13 | torch.set_rng_state(torch.manual_seed(42).get_state()) 14 | 15 | def test_byol(self): 16 | byol = BYOL( 17 | backbone=nn.Linear(8, 4), 18 | projector=nn.Linear(4, 4), 19 | feature_dim=4, 20 | norm=nn.BatchNorm1d, 21 | ) 22 | for crop1, crop2 in TestBYOL._get_inputs(): 23 | byol(crop1, crop2) 24 | 25 | @staticmethod 26 | def _get_inputs() -> torch.tensor: 27 | """ 28 | Provide different tensors as test cases. 29 | 30 | Yield: 31 | (torch.tensor): tensor as test case input. 32 | """ 33 | # Prepare random inputs as test cases. 34 | shapes = ((2, 8),) 35 | for shape in shapes: 36 | yield torch.rand(shape), torch.rand(shape) 37 | -------------------------------------------------------------------------------- /tests/test_models_hub_vision_transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import os 4 | import unittest 5 | 6 | import torch 7 | import torch.nn as nn 8 | from pytorchvideo.models.hub.utils import hub_model_builder 9 | 10 | 11 | class TestHubVisionTransformers(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | torch.set_rng_state(torch.manual_seed(42).get_state()) 15 | 16 | def test_load_hubconf(self): 17 | def test_load_mvit_(model_name, pretrained): 18 | path = os.path.join( 19 | os.path.dirname(os.path.realpath(__file__)), 20 | "..", 21 | ) 22 | model = torch.hub.load( 23 | repo_or_dir=path, 24 | source="local", 25 | model=model_name, 26 | pretrained=pretrained, 27 | ) 28 | self.assertIsNotNone(model) 29 | 30 | models = [ 31 | "mvit_base_16x4", 32 | "mvit_base_16", 33 | "mvit_base_32x3", 34 | ] 35 | pretrains = [False, False, False] 36 | 37 | for model_name, pretrain in zip(models, pretrains): 38 | test_load_mvit_(model_name, pretrain) 39 | 40 | def test_hub_model_builder(self): 41 | def _fake_model(in_features=10, out_features=10) -> nn.Module: 42 | """ 43 | A fake model builder with a linear layer. 44 | """ 45 | model = nn.Linear(in_features, out_features) 46 | return model 47 | 48 | in_fea = 5 49 | default_config = {"in_features": in_fea} 50 | model = hub_model_builder( 51 | model_builder_func=_fake_model, default_config=default_config 52 | ) 53 | self.assertEqual(model.in_features, in_fea) 54 | self.assertEqual(model.out_features, 10) 55 | 56 | # Test case where add_config overwrites default_config. 
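        # (Added clarification: keyword arguments passed directly to
        # hub_model_builder act as user overrides, so the in_features /
        # out_features below take precedence over the in_features entry
        # supplied via default_config.)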
57 | in_fea = 5 58 | default_config = {"in_features": in_fea} 59 | add_in_fea = 2 60 | add_out_fea = 3 61 | 62 | model = hub_model_builder( 63 | model_builder_func=_fake_model, 64 | default_config=default_config, 65 | in_features=add_in_fea, 66 | out_features=add_out_fea, 67 | ) 68 | self.assertEqual(model.in_features, add_in_fea) 69 | self.assertEqual(model.out_features, add_out_fea) 70 | 71 | # Test assertions. 72 | self.assertRaises( 73 | AssertionError, 74 | hub_model_builder, 75 | model_builder_func=_fake_model, 76 | pretrained=True, 77 | default_config={}, 78 | fake_input=None, 79 | ) 80 | -------------------------------------------------------------------------------- /tests/test_models_memory_bank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.models.memory_bank import MemoryBank 7 | from torch import nn 8 | 9 | 10 | class TestMemoryBank(unittest.TestCase): 11 | def setUp(self): 12 | super().setUp() 13 | torch.set_rng_state(torch.manual_seed(42).get_state()) 14 | 15 | def test_memory_bank(self): 16 | simclr = MemoryBank( 17 | backbone=nn.Linear(8, 4), 18 | mlp=nn.Linear(4, 2), 19 | temperature=0.07, 20 | bank_size=8, 21 | dim=2, 22 | ) 23 | for crop, ind in TestMemoryBank._get_inputs(): 24 | simclr(crop, ind) 25 | 26 | @staticmethod 27 | def _get_inputs(bank_size: int = 8) -> torch.tensor: 28 | """ 29 | Provide different tensors as test cases. 30 | 31 | Yield: 32 | (torch.tensor): tensor as test case input. 33 | """ 34 | # Prepare random inputs as test cases. 35 | shapes = ((2, 8),) 36 | for shape in shapes: 37 | yield torch.rand(shape), torch.randint(0, bank_size, size=(shape[0],)) 38 | -------------------------------------------------------------------------------- /tests/test_simclr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import unittest 4 | 5 | import torch 6 | from pytorchvideo.models.simclr import SimCLR 7 | from torch import nn 8 | 9 | 10 | class TestSimCLR(unittest.TestCase): 11 | def setUp(self): 12 | super().setUp() 13 | torch.set_rng_state(torch.manual_seed(42).get_state()) 14 | 15 | def test_simclr(self): 16 | simclr = SimCLR( 17 | backbone=nn.Linear(8, 4), 18 | mlp=nn.Linear(4, 2), 19 | temperature=0.07, 20 | ) 21 | for crop1, crop2 in TestSimCLR._get_inputs(): 22 | simclr(crop1, crop2) 23 | 24 | @staticmethod 25 | def _get_inputs() -> torch.tensor: 26 | """ 27 | Provide different tensors as test cases. 28 | 29 | Yield: 30 | (torch.tensor): tensor as test case input. 31 | """ 32 | # Prepare random inputs as test cases. 
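        # (Added clarification: each yielded pair plays the role of two
        # augmented views of the same batch, and the trailing dimension (8)
        # matches the nn.Linear(8, 4) backbone used in the test above.)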
33 | shapes = ( 34 | (1, 8), 35 | (2, 8), 36 | ) 37 | for shape in shapes: 38 | yield torch.rand(shape), torch.rand(shape) 39 | -------------------------------------------------------------------------------- /tutorials/video_classification_example/environment.yml: -------------------------------------------------------------------------------- 1 | # Conda environment file 2 | # Usage: `conda env update -f environment.yml` 3 | 4 | name: video_classification_example 5 | 6 | channels: 7 | - conda-forge 8 | - pytorch-nightly 9 | 10 | dependencies: 11 | - pytorch-lightning 12 | - submitit 13 | -------------------------------------------------------------------------------- /tutorials/video_classification_example/slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import os 5 | import pathlib 6 | import shutil 7 | 8 | import submitit 9 | 10 | 11 | def init_and_run(run_fn, run_config): 12 | os.environ["RANK"] = os.environ["SLURM_LOCALID"] 13 | os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"] 14 | os.environ["NODE_RANK"] = os.environ["SLURM_LOCALID"] 15 | os.environ["WORLD_SIZE"] = os.environ["SLURM_NTASKS"] 16 | run_fn(run_config) 17 | 18 | 19 | def copy_and_run_with_config(run_fn, run_config, directory, **cluster_config): 20 | working_directory = pathlib.Path(directory) / cluster_config["job_name"] 21 | ignore_list = [ 22 | "lightning_logs", 23 | "logs", 24 | "checkpoints", 25 | "experiments", 26 | ".git", 27 | "output", 28 | "val.csv", 29 | "train.csv", 30 | ] 31 | shutil.copytree(".", working_directory, ignore=lambda x, y: ignore_list) 32 | os.chdir(working_directory) 33 | print(f"Running at {working_directory}") 34 | 35 | executor = submitit.SlurmExecutor(folder=working_directory) 36 | executor.update_parameters(**cluster_config) 37 | job = executor.submit(init_and_run, run_fn, run_config) 38 | print(f"job_id: {job}") 39 | -------------------------------------------------------------------------------- /website/.dockerignore: -------------------------------------------------------------------------------- 1 | */node_modules 2 | *.log 3 | -------------------------------------------------------------------------------- /website/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | node_modules 4 | 5 | lib/core/metadata.js 6 | lib/core/MetadataBlog.js 7 | 8 | website/translated_docs 9 | website/build/ 10 | website/yarn.lock 11 | website/node_modules 12 | website/i18n/* 13 | -------------------------------------------------------------------------------- /website/docs/tutorial_overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: tutorial_overview 3 | title: Tutorials 4 | sidebar_label: Overview 5 | --- 6 | 7 | PyTorchVideo tutorials are designed to help you get acquainted with the library and also give you an idea on how to incorporate different PyTorchVideo components into your own video-research workflow. In the tutorials, through examples, we also show how PyTorchVideo makes it easy to address some of the common deeplearning video use cases. 8 | 9 | PyTorchVideo is built on PyTorch. If you are new to PyTorch, the easiest way to get started is with the [PyTorch: A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#sphx-glr-beginner-blitz-tensor-tutorial-py) tutorial. 
10 | -------------------------------------------------------------------------------- /website/website/core/Footer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2017-present, Facebook, Inc. 3 | * 4 | * This source code is licensed under the MIT license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | 9 | const PropTypes = require("prop-types"); 10 | const React = require('react'); 11 | 12 | function SocialFooter(props) { 13 | const repoUrl = `https://github.com/${props.config.organizationName}/${props.config.projectName}`; 14 | return ( 15 | 29 | ); 30 | } 31 | 32 | SocialFooter.propTypes = { 33 | config: PropTypes.object 34 | }; 35 | 36 | class Footer extends React.Component { 37 | docUrl(doc, language) { 38 | const baseUrl = this.props.config.baseUrl; 39 | const docsUrl = this.props.config.docsUrl; 40 | const docsPart = `${docsUrl ? `${docsUrl}/` : ''}`; 41 | const langPart = `${language ? `${language}/` : ''}`; 42 | return `${baseUrl}${docsPart}${langPart}${doc}`; 43 | } 44 | 45 | pageUrl(doc, language) { 46 | const baseUrl = this.props.config.baseUrl; 47 | return baseUrl + (language ? `${language}/` : '') + doc; 48 | } 49 | 50 | render() { 51 | const repoUrl = `https://github.com/${this.props.config.organizationName}/${this.props.config.projectName}`; 52 | return ( 53 | 87 | ); 88 | } 89 | } 90 | 91 | module.exports = Footer; -------------------------------------------------------------------------------- /website/website/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "scripts": { 3 | "examples": "docusaurus-examples", 4 | "start": "docusaurus-start", 5 | "build": "docusaurus-build", 6 | "publish-gh-pages": "docusaurus-publish", 7 | "write-translations": "docusaurus-write-translations", 8 | "version": "docusaurus-version", 9 | "rename-version": "docusaurus-rename-version" 10 | }, 11 | "devDependencies": { 12 | "docusaurus": "^1.14.6" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /website/website/sidebars.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs-other": { 3 | "Tutorials": ["tutorial_overview"], 4 | "Classification": ["tutorial_classification", "tutorial_torchhub_inference"], 5 | "Detection": ["tutorial_torchhub_detection_inference"], 6 | "Accelerator": ["tutorial_accelerator_build_your_model", "tutorial_accelerator_use_accelerator_model_zoo", "tutorial_accelerator_use_model_transmuter"] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /website/website/siteConfig.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | // See https://docusaurus.io/docs/site-config for all the possible 9 | // site configuration options. 10 | 11 | 12 | const siteConfig = { 13 | title: 'PyTorchVideo', // Title for your website. 
14 | tagline: 'A deep learning library for video understanding research', 15 | url: 'https://pytorchvideo.org', // Your website URL 16 | baseUrl: '/', 17 | 18 | // Used for publishing and more 19 | projectName: 'pytorchvideo', 20 | organizationName: 'facebookresearch', 21 | 22 | // For no header links in the top nav bar -> headerLinks: [], 23 | headerLinks: [ 24 | {doc: 'tutorial_overview', label: 'Tutorials'}, 25 | {href: "https://pytorchvideo.readthedocs.io/en/latest/index.html", label: 'Docs'}, // TODO: Change this after the repo becomes public. 26 | {href: "https://github.com/facebookresearch/pytorchvideo/", label: 'GitHub'}, //TODO: Change this after repo becomes public 27 | ], 28 | 29 | 30 | /* path to images for header/footer */ 31 | headerIcon: 'img/logo.svg', 32 | footerIcon: 'img/logo.svg', 33 | favicon: 'img/favicon.png', 34 | 35 | /* Colors for website */ 36 | colors: { 37 | primaryColor: '#812ce5', 38 | secondaryColor: '#cc33cc', 39 | }, 40 | 41 | // This copyright info is used in /core/Footer.js and blog RSS/Atom feeds. 42 | copyright: `Copyright © ${new Date().getFullYear()} Facebook, Inc`, 43 | 44 | highlight: { 45 | // Highlight.js theme to use for syntax highlighting in code blocks. 46 | theme: 'atom-one-dark', 47 | }, 48 | 49 | // Add custom scripts here that would be placed in