├── .flake8 ├── .gitattributes ├── .gitignore ├── ACKNOWLEDGMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── aff.png ├── architecture.png ├── builtin.py ├── builtin_meta.py ├── configs ├── ade20k │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ ├── aff │ │ ├── maskformer2_aff_mini_1_5th_bs32_80k.yaml │ │ ├── maskformer2_aff_mini_bs32_80k.yaml │ │ ├── maskformer2_aff_small_1_5th_bs32_80k.yaml │ │ ├── maskformer2_aff_small_bs32_80k.yaml │ │ ├── maskformer2_aff_tiny_1_5th_bs32_80k.yaml │ │ └── maskformer2_aff_tiny_bs32_80k.yaml │ │ └── maskformer2_R50_bs16_160k.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ ├── aff │ │ │ ├── maskformer2_aff_base_384_bs16_90k.yaml │ │ │ ├── maskformer2_aff_mini_bs32_45k.yaml │ │ │ ├── maskformer2_aff_small_bs32_45k.yaml │ │ │ └── maskformer2_aff_tiny_bs32_45k.yaml │ │ └── maskformer2_R50_bs16_90k.yaml │ └── panoptic-segmentation │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ ├── aff │ │ ├── maskformer2_aff_base_384_bs16_90k.yaml │ │ ├── maskformer2_aff_mini_bs32_45k.yaml │ │ ├── maskformer2_aff_small_bs32_45k.yaml │ │ └── maskformer2_aff_tiny_bs32_45k.yaml │ │ └── maskformer2_R50_bs16_90k.yaml └── coco │ └── instance-segmentation │ ├── Base-COCO-InstanceSegmentation.yaml │ ├── aff │ ├── maskformer2_aff_mini_1_5th_bs64_50ep.yaml │ ├── maskformer2_aff_mini_bs64_50ep.yaml │ ├── maskformer2_aff_small_1_5th_bs64_50ep.yaml │ ├── maskformer2_aff_small_bs64_50ep.yaml │ ├── maskformer2_aff_tiny_1_5th_bs64_50ep.yaml │ └── maskformer2_aff_tiny_bs64_50ep.yaml │ └── maskformer2_R50_bs16_50ep.yaml ├── create_env.sh ├── datasets ├── README.md ├── prepare_ade20k_sem_seg.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_cocofied_lvis.py └── prepare_cocofied_lvisv1.py ├── demo ├── demo.py └── predictor.py ├── demo1.png ├── demo2.png ├── mask2former ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ └── register_coco_panoptic_annos_semseg.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── aff.py │ │ └── point_utils.py │ ├── clusten │ │ ├── __init__.py │ │ ├── clusten.py │ │ ├── src │ │ │ ├── clustenav_cuda.cpp │ │ │ ├── clustenav_cuda_kernel.cu │ │ │ ├── clustenqk_cuda.cpp │ │ │ ├── clustenqk_cuda_kernel.cu │ │ │ ├── clustenwf_cuda.cpp │ │ │ ├── clustenwf_cuda_kernel.cu │ │ │ ├── msdetrpc_cuda.cpp │ │ │ ├── msdetrpc_cuda_kernel.cu │ │ │ ├── setup.py │ │ │ ├── weighted_gather_cuda.cpp │ │ │ └── weighted_gather_cuda_kernel.cu │ │ ├── test_msdetrpc_kernel.py │ │ └── test_wg_kernel.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ └── msdeformattn_pc.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ └── misc.py ├── run_aff_segmentation.sh ├── run_demo.sh ├── tools ├── README.md ├── analyze_model.py └── convert-pretrained-model-to-d2.py └── train_net.py /.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | select = B,C,E,F,P,T4,W,B9 3 | max-line-length = 120 4 | # C408 ignored because we like the dict keyword argument syntax 5 | # E501 is not flexible enough, we're using B950 instead 6 | ignore = 7 | E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E303,E226, 8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 9 | # to line this up with executable bit 10 | EXE001, 11 | # these ignores are from flake8-bugbear; please fix! 12 | B007,B008, 13 | # these ignores are from flake8-comprehensions; please fix! 14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415, 15 | # for "unable to detect undefined names" 16 | F403, 17 | # for "Too many leading '#' for block comment (E266)" 18 | E266, 19 | # for "E731 do not assign a lambda expression, use a def" 20 | E731, 21 | # for "future feature annotations is not defined" 22 | F407, 23 | # do not use bare 'except' 24 | E722, 25 | per-file-ignores = 26 | __init__.py: F401, 27 | #pre_table is used as a global variable 28 | mask2former/modeling/pixel_decoder/msdeformattn_pc.py: F401 29 | optional-ascii-coding = True 30 | exclude = 31 | ./.git, 32 | ./docs, 33 | ./scripts, 34 | ./test 35 | ./third_party, 36 | ./venv, 37 | *.pyi 38 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pth filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.svg 2 | .nfs* 3 | .DS_Store 4 | __pycache__/ 5 | *swp* 6 | output/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 
50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 125 | [https://www.contributor-covenant.org/translations][translations]. 
126 | 127 | [homepage]: https://www.contributor-covenant.org 128 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 129 | [Mozilla CoC]: https://github.com/mozilla/diversity 130 | [FAQ]: https://www.contributor-covenant.org/faq 131 | [translations]: https://www.contributor-covenant.org/translations 132 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 10 | 11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoFocusFormer 2 | 3 | [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md) 4 | [![CLUSTEN](https://img.shields.io/badge/CUDA%20Extension-CLUSTEN-red)](clusten/) 5 | 6 | AFF-Base: [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/autofocusformer-image-segmentation-off-the/instance-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/instance-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/autofocusformer-image-segmentation-off-the/panoptic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/panoptic-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the) 7 | 8 | This software project accompanies the research paper, *AutoFocusFormer: Image Segmentation off the Grid* (CVPR 2023). 9 | 10 | [Chen Ziwen](https://www.chenziwe.com), Kaushik Patnaik, [Shuangfei Zhai](https://scholar.google.com/citations?user=G6vdBYsAAAAJ&hl=en), [Alvin Wan](http://alvinwan.com), [Zhile Ren](https://jrenzhile.com), [Alex Schwing](https://alexander-schwing.de/), [Alex Colburn](https://www.colburn.org), [Li Fuxin](https://web.engr.oregonstate.edu/~lif/) 11 | 12 | [arXiv](https://arxiv.org/abs/2304.12406) | [video narration](https://youtu.be/i1mZtk70yGY) | [AFF-Classification](https://github.com/apple/ml-autofocusformer) | [AFF-Segmentation (this repo)](https://github.com/apple/ml-autofocusformer-segmentation) 13 | 14 | ## Introduction 15 | 16 | AutoFocusFormer (AFF) is the first **adaptive**-downsampling network capable of **dense** prediction tasks such as semantic/instance segmentation. 17 | 18 | AFF abandons the traditional grid structure of image feature maps, and automatically learns to retain the most important pixels with respect to the task goal. 19 | 20 |
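To make the idea concrete, below is a minimal, hypothetical sketch of content-aware downsampling on a set of tokens with explicit 2D positions, assuming a simple keep-the-top-scoring-tokens rule; the actual AFF downsampler (see `mask2former/modeling/backbone/aff.py`) is considerably more involved, and all names in this snippet are illustrative only.

```python
import torch

def downsample_by_importance(pos, feat, score, rate=0.25):
    """Toy content-aware downsampling: keep the top `rate` fraction of tokens.

    pos:   (B, N, 2) token positions -- no grid structure is assumed
    feat:  (B, N, C) token features
    score: (B, N)    per-token importance (in AFF this is learned end-to-end)
    """
    k = max(1, int(pos.shape[1] * rate))
    idx = score.topk(k, dim=1).indices.unsqueeze(-1)           # (B, k, 1) indices of kept tokens
    kept_pos = torch.gather(pos, 1, idx.expand(-1, -1, pos.shape[-1]))
    kept_feat = torch.gather(feat, 1, idx.expand(-1, -1, feat.shape[-1]))
    return kept_pos, kept_feat

# toy example: 1 image, 16 tokens, 8-dim features, keep 1/4 of the tokens
pos, feat, score = torch.rand(1, 16, 2), torch.randn(1, 16, 8), torch.rand(1, 16)
print([t.shape for t in downsample_by_importance(pos, feat, score)])
# [torch.Size([1, 4, 2]), torch.Size([1, 4, 8])]
```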
23 | 24 | AFF consists of a local-attention transformer backbone and a task-specific head. The backbone is organized into four stages, each containing three modules: balanced clustering, local-attention transformer blocks, and adaptive downsampling. 25 | 26 |
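As a concrete example of this three-module stage structure, the per-stage hyper-parameters of the AFF-Mini backbone can be read directly from the configs later in this repo (`EMBED_DIM`, `DEPTHS`, `NUM_HEADS`, `NBHD_SIZE`, `CLUSTER_SIZE`, `DS_RATE`); the snippet below only prints that schedule and is not the model code itself:

```python
# Values taken from configs/*/aff/maskformer2_aff_mini_*.yaml (MODEL.AFF).
embed_dim = [32, 128, 256, 384]   # feature dimension per stage
depths    = [2, 2, 6, 2]          # local-attention blocks per stage
num_heads = [2, 4, 8, 16]         # attention heads per stage
nbhd_size = [48, 48, 48, 48]      # attended neighborhood size per stage
cluster_size, ds_rate = 8, 0.25   # tokens per cluster; downsampling rate (the 1/5 variants use 0.2)

for i, (dim, depth, heads, nbhd) in enumerate(zip(embed_dim, depths, num_heads, nbhd_size), 1):
    print(f"stage {i}: balanced clustering (cluster size {cluster_size}) -> "
          f"{depth} local-attention blocks (dim {dim}, heads {heads}, nbhd {nbhd}) -> "
          f"adaptive downsampling (rate {ds_rate})")
```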
29 | 30 | AFF delivers significant FLOPs savings (see our models with a 1/5 downsampling rate, i.e. `DS_RATE: 0.2` in the configs) and a marked improvement in the recognition of small objects. 31 | 32 | Notably, AFF-Small achieves **44.0** instance segmentation AP and **66.9** panoptic segmentation PQ on Cityscapes val with a backbone of only **42.6M** parameters, on par with Swin-Large, a backbone with **197M** params (a **78%** saving!). 33 | 34 |
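The quoted saving is simply the ratio of the two backbone sizes above: (197M − 42.6M) / 197M ≈ 78%.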
41 | 42 | This repository contains the AFF backbone and the point cloud-version of the Mask2Former segmentation head. 43 | 44 | We also add a few convenient functionalities, such as visualizing prediction results on blurred version of the images, and evaluating on cocofied lvis v1 annotations. 45 | 46 | ## Main Results with Pretrained Models 47 | 48 | **ADE20K Semantic Segmentation (val)** 49 | | backbone | method | pretrain | crop size | mIoU | FLOPs | checkpoint | 50 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 51 | | AFF-Mini | Mask2Former | ImageNet-1K | 512x512 | 46.5 | 48.3G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini.pth) | 52 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 512x512 | 46.0 | 39.9G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini_1_5th.pth) | 53 | | AFF-Tiny | Mask2Former | ImageNet-1K | 512x512 | 50.2 | 64.6G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny.pth) | 54 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 512x512 | 50.0 | 51.1G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny_1_5th.pth) | 55 | | AFF-Small | Mask2Former | ImageNet-1K | 512x512 | 51.2 | 87G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small.pth) | 56 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 512x512 | 51.9 | 67.2G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small_1_5th.pth) | 57 | 58 | **Cityscapes Instance Segmentation (val)** 59 | | backbone | method | pretrain | AP | checkpoint | 60 | | :---: | :---: | :---: | :---: | :---: | 61 | | AFF-Mini | Mask2Former | ImageNet-1K | 40.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_mini.pth) | 62 | | AFF-Tiny | Mask2Former | ImageNet-1K | 42.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_tiny.pth) | 63 | | AFF-Small | Mask2Former | ImageNet-1K | 44.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_small.pth) | 64 | | AFF-Base | Mask2Former | ImageNet-22K | 46.2 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) | 65 | 66 | **Cityscapes Panoptic Segmentation (val)** 67 | | backbone | method | pretrain | PQ(s.s.) 
| checkpoint | 68 | | :---: | :---: | :---: | :---: | :---: | 69 | | AFF-Mini | Mask2Former | ImageNet-1K | 62.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_mini.pth) | 70 | | AFF-Tiny | Mask2Former | ImageNet-1K | 65.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_tiny.pth) | 71 | | AFF-Small | Mask2Former | ImageNet-1K | 66.9 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_small.pth) | 72 | | AFF-Base | Mask2Former | ImageNet-22K | 67.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) | 73 | 74 | **COCO Instance Segmentation (val)** 75 | | backbone | method | pretrain | epochs | AP | FLOPs | checkpoint | 76 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 77 | | AFF-Mini | Mask2Former | ImageNet-1K | 50 | 42.3 | 148G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini.pth) | 78 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 50 | 42.3 | 120G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini_1_5th.pth) | 79 | | AFF-Tiny | Mask2Former | ImageNet-1K | 50 | 45.3 | 204G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny.pth) | 80 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 50 | 44.5 | 152G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny_1_5th.pth) | 81 | | AFF-Small | Mask2Former | ImageNet-1K | 50 | 46.4 | 281G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small.pth) | 82 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 50 | 45.7 | 206G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small_1_5th.pth) | 83 | 84 | ## Getting Started 85 | 86 | ### Clone this repo 87 | 88 | ```bash 89 | git clone git@github.com:apple/ml-autofocusformer-segmentation.git 90 | cd ml-autofocusformer-segmentation 91 | ``` 92 | One can download the pre-trained checkpoints through the links in the tables above. 93 | 94 | ### Create environment and install requirements 95 | 96 | ```bash 97 | sh create_env.sh 98 | ``` 99 | 100 | See further documentation inside the script file. 101 | 102 | Our experiments are run with `CUDA==11.6` and `pytorch==1.12`. 103 | 104 | ### Prepare data 105 | 106 | Please refer to [dataset README](datasets/README.md). 107 | 108 | ### Prepare pre-trained backbone checkpoint 109 | 110 | Use `tools/convert-pretrained-model-to-d2.py` to convert any torch checkpoint `.pth` file trained on ImageNet into a Detectron2 model zoo format `.pkl` file. 111 | ``` 112 | python tools/convert-pretrained-model-to-d2.py aff_mini.pth aff_mini.pkl 113 | ``` 114 | Otherwise, d2 will assume the checkpoint is for the entire segmentation model and will not add `backbone.` to the parameter names, and thus the checkpoint will not be properly loaded. 115 | 116 | ### Train and evaluate 117 | 118 | Modify the arguments in script `run_aff_segmentation.sh` and run 119 | ```bash 120 | sh run_aff_segmentation.sh 121 | ``` 122 | for training or evaluation. 123 | 124 | One can also directly modify the config files in `configs/`. 125 | 126 | ### Visualize predictions for pre-trained models 127 | 128 | See script `run_demo.sh`. 
More details can be found in [Mask2Former GETTING_STARTED.md](https://github.com/facebookresearch/Mask2Former/blob/main/GETTING_STARTED.md). 129 | 130 | ### Analyze model FLOPs 131 | 132 | See [tools README](tools/README.md). 133 | 134 | ## Citing AutoFocusFormer 135 | 136 | ```BibTeX 137 | @inproceedings{autofocusformer, 138 | title = {AutoFocusFormer: Image Segmentation off the Grid}, 139 | booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 140 | author = {Ziwen, Chen and Patnaik, Kaushik and Zhai, Shuangfei and Wan, Alvin and Ren, Zhile and Schwing, Alex and Colburn, Alex and Fuxin, Li}, 141 | year = {2023}, 142 | } 143 | ``` 144 | -------------------------------------------------------------------------------- /aff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/aff.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/architecture.png -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | 
EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_mini_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.2 17 | WEIGHTS: "aff_small_1_5th.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 80000 24 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 80000 24 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_tiny_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | 
WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [128, 256, 512, 1024] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [4,8,16,32] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 24 13 | NBHD_SIZE: [144,144,144,144] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_base_22kto1k_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | MASK_FORMER: 21 | NUM_OBJECT_QUERIES: 250 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | MAX_ITER: 90000 26 | TEST: 27 | DETECTIONS_PER_IMAGE: 250 28 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 45000 24 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 
22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [128, 256, 512, 1024] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [4,8,16,32] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 24 13 | NBHD_SIZE: [144,144,144,144] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_base_22kto1k_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | MASK_FORMER: 21 | NUM_OBJECT_QUERIES: 250 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | MAX_ITER: 90000 26 | TEST: 27 | DETECTIONS_PER_IMAGE: 250 28 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 45000 24 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | AFF: 17 | SHEPARD_POWER: 4.0 18 | SHEPARD_POWER_LEARNABLE: False 19 | DATASETS: 20 | TRAIN: ("coco_2017_train",) 21 | TEST: ("coco_2017_val",) 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | 
STEPS: (327778, 355092) 26 | MAX_ITER: 368750 27 | WARMUP_FACTOR: 1.0 28 | WARMUP_ITERS: 10 29 | WEIGHT_DECAY: 0.05 30 | OPTIMIZER: "ADAMW" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: False 39 | INPUT: 40 | IMAGE_SIZE: 1024 41 | MIN_SCALE: 0.1 42 | MAX_SCALE: 2.0 43 | FORMAT: "RGB" 44 | DATASET_MAPPER_NAME: "coco_instance_lsj" 45 | TEST: 46 | EVAL_PERIOD: 5000 47 | DATALOADER: 48 | FILTER_EMPTY_ANNOTATIONS: True 49 | NUM_WORKERS: 4 50 | VERSION: 2 51 | SEED: 0 52 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_mini_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_mini_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_mini_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_small_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.2 17 | WEIGHTS: "aff_small_1_5th.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 64 22 | BASE_LR: 0.0002 23 | STEPS: (81945, 88773) 24 | MAX_ITER: 92188 25 | WARMUP_ITERS: 3 26 | CHECKPOINT_PERIOD: 2500 27 | TEST: 28 | EVAL_PERIOD: 2500 29 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_small_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 64 22 | BASE_LR: 0.0002 23 | STEPS: (81945, 88773) 24 | MAX_ITER: 92188 25 | WARMUP_ITERS: 3 26 | CHECKPOINT_PERIOD: 2500 27 | TEST: 28 | EVAL_PERIOD: 2500 29 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_tiny_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /create_env.sh: -------------------------------------------------------------------------------- 1 | # Create a conda virtual environment and activate it 2 | conda create -n aff python=3.8 3 | conda activate aff 4 | 5 | # Install requirements 6 | pip install \ 7 | yacs==0.1.8 \ 8 | termcolor==2.2.0 \ 9 | timm==0.6.12 \ 10 | pykeops==2.1.1 \ 11 | ptflops==0.6.9 \ 12 | numpy==1.22.4 \ 13 | cython==0.29.33 \ 14 | scipy==1.9.1 \ 15 | shapely==2.0.1 \ 16 | h5py==3.8.0 \ 17 | submitit==1.4.5 \ 18 | scikit-image==0.20.0 19 | conda install -c conda-forge opencv 20 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.6 -c pytorch -c conda-forge 21 | 22 | # Detectron2 23 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 24 | 25 | # add ADE20K_SEM_SEG_CATEGORIES_COLORS for consistent color in ADE prediction visualization 26 | mv ./builtin.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets 27 | mv ./builtin_meta.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets 28 | 29 | # Install the custom CUDA kernels for AFF 30 | cd mask2former/modeling/clusten/src && python setup.py install 31 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for 
its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.). 5 | This document explains how to set up the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive into how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | The datasets are assumed to exist in a directory specified by the environment variable 10 | `DETECTRON2_DATASETS`. 11 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 12 | ``` 13 | $DETECTRON2_DATASETS/ 14 | ADEChallengeData2016/ 15 | coco/ 16 | cityscapes/ 17 | ``` 18 | 19 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 20 | If left unset, the default is `./datasets` relative to your current working directory. 21 | 22 | 23 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download): 24 | 25 | ``` 26 | coco/ 27 | annotations/ 28 | instances_{train,val}2017.json 29 | panoptic_{train,val}2017.json 30 | {train,val}2017/ 31 | # image files that are mentioned in the corresponding json 32 | panoptic_{train,val}2017/ # png annotations 33 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 34 | ``` 35 | 36 | Install panopticapi by: 37 | ``` 38 | pip install git+https://github.com/cocodataset/panopticapi.git 39 | ``` 40 | Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 41 | 42 | 43 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 44 | ``` 45 | cityscapes/ 46 | gtFine/ 47 | train/ 48 | aachen/ 49 | color.png, instanceIds.png, labelIds.png, polygons.json, 50 | labelTrainIds.png 51 | ... 52 | val/ 53 | test/ 54 | # below are the generated Cityscapes panoptic annotations 55 | cityscapes_panoptic_train.json 56 | cityscapes_panoptic_train/ 57 | cityscapes_panoptic_val.json 58 | cityscapes_panoptic_val/ 59 | cityscapes_panoptic_test.json 60 | cityscapes_panoptic_test/ 61 | leftImg8bit/ 62 | train/ 63 | val/ 64 | test/ 65 | ``` 66 | Download the Cityscapes scripts by: 67 | ``` 68 | git clone https://github.com/mcordts/cityscapesScripts.git 69 | ``` 70 | 71 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation tool with: 72 | ``` 73 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py 74 | ``` 75 | These files are not needed for instance segmentation. 76 | 77 | Note: to generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation tool with: 78 | ``` 79 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py 80 | ``` 81 | These files are not needed for semantic and instance segmentation.
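
A quick way to catch path problems before launching training is to verify the layout under `DETECTRON2_DATASETS`. The snippet below is a minimal, illustrative check (the listed paths are assumptions; adjust them to the datasets and tasks you actually use):
```
import os
from pathlib import Path

# detectron2 falls back to ./datasets when DETECTRON2_DATASETS is unset
root = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))

# A few paths assumed by the builtin COCO and Cityscapes registrations;
# extend the list for ADE20k, LVIS, or the panoptic files as needed.
expected = [
    "coco/annotations/instances_train2017.json",
    "coco/train2017",
    "coco/val2017",
    "cityscapes/gtFine",
    "cityscapes/leftImg8bit",
]

for rel in expected:
    path = root / rel
    status = "ok" if path.exists() else "MISSING"
    print(f"{status:8} {path}")
```
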
82 | 83 | 84 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/): 85 | ``` 86 | ADEChallengeData2016/ 87 | images/ 88 | annotations/ 89 | objectInfo150.txt 90 | # download instance annotation 91 | annotations_instance/ 92 | # generated by prepare_ade20k_sem_seg.py 93 | annotations_detectron2/ 94 | # below are generated by prepare_ade20k_pan_seg.py 95 | ade20k_panoptic_{train,val}.json 96 | ade20k_panoptic_{train,val}/ 97 | # below are generated by prepare_ade20k_ins_seg.py 98 | ade20k_instance_{train,val}.json 99 | ``` 100 | 101 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 102 | 103 | ## Expected dataset structure for [LVIS instance segmentation](https://www.lvisdataset.org/dataset): 104 | ``` 105 | coco/ 106 | {train,val,test}2017/ 107 | lvis/ 108 | lvis_v0.5_{train,val}.json 109 | lvis_v0.5_image_info_test.json 110 | lvis_v1_{train,val}.json 111 | lvis_v1_image_info_test{,_challenge}.json 112 | ``` 113 | 114 | Install lvis-api by: 115 | ``` 116 | pip install git+https://github.com/lvis-dataset/lvis-api.git 117 | ``` 118 | 119 | To evaluate models trained on the COCO dataset using LVIS annotations, 120 | run `python datasets/prepare_cocofied_lvis.py` to prepare "cocofied" LVIS v0.5 annotations, 121 | or `python datasets/prepare_cocofied_lvisv1.py` to prepare "cocofied" LVIS v1 annotations. 122 | 123 | Then, add `("lvis_v0.5_val_cocofied",)` or `("lvis_v1_val_cocofied",)` to DATASETS:TEST in config files. 124 | 125 | Finally, for v1, add `lvis_v1_cocofied` entry 126 | ``` 127 | "lvis_v1_cocofied": { 128 | "lvis_v1_val_cocofied": ("coco/", "lvis/lvis_v1_val_cocofied.json"), 129 | }, 130 | ``` 131 | to detectron2/data/datasets/builtin.py. 132 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import os 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import tqdm 10 | from PIL import Image 11 | 12 | 13 | def convert(input, output): 14 | img = np.asarray(Image.open(input)) 15 | assert img.dtype == np.uint8 16 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 17 | Image.fromarray(img).save(output) 18 | 19 | 20 | if __name__ == "__main__": 21 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 22 | for name in ["training", "validation"]: 23 | annotation_dir = dataset_dir / "annotations" / name 24 | output_dir = dataset_dir / "annotations_detectron2" / name 25 | output_dir.mkdir(parents=True, exist_ok=True) 26 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 27 | output_file = output_dir / file.name 28 | convert(file, output_file) 29 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
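# In short: this script reads annotations/panoptic_{train,val}2017.json together with the
# matching panoptic PNGs and writes one semantic PNG per image into panoptic_semseg_{train,val}2017/.
# Each pixel stores the contiguous index of its COCO category (things and stuff alike, following
# the order of COCO_CATEGORIES), and 255 marks unlabeled pixels. It expects DETECTRON2_DATASETS
# to point at the dataset root described in datasets/README.md.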
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from panopticapi.utils import rgb2id 12 | from PIL import Image 13 | 14 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 15 | 16 | 17 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 18 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 19 | panoptic = rgb2id(panoptic) 20 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 21 | for seg in segments: 22 | cat_id = seg["category_id"] 23 | new_cat_id = id_map[cat_id] 24 | output[panoptic == seg["id"]] = new_cat_id 25 | Image.fromarray(output).save(output_semantic) 26 | 27 | 28 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 29 | """ 30 | Create semantic segmentation annotations from panoptic segmentation 31 | annotations, to be used by PanopticFPN. 32 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 33 | It maps all stuff categories to contiguous ids starting from 1. 34 | Args: 35 | panoptic_json (str): path to the panoptic json file, in COCO's format. 36 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 37 | sem_seg_root (str): a directory to output semantic annotation files 38 | categories (list[dict]): category metadata. Each dict needs to have: 39 | "id": corresponds to the "category_id" in the json annotations 40 | "isthing": 0 or 1 41 | """ 42 | os.makedirs(sem_seg_root, exist_ok=True) 43 | 44 | id_map = {} # map from category id to id in the output semantic annotation 45 | assert len(categories) <= 254 46 | for i, k in enumerate(categories): 47 | id_map[k["id"]] = i 48 | # what is id = 0? 49 | # id_map[0] = 255 50 | print(id_map) 51 | 52 | with open(panoptic_json) as f: 53 | obj = json.load(f) 54 | 55 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 56 | 57 | def iter_annotations(): 58 | for anno in obj["annotations"]: 59 | file_name = anno["file_name"] 60 | segments = anno["segments_info"] 61 | input = os.path.join(panoptic_root, file_name) 62 | output = os.path.join(sem_seg_root, file_name) 63 | yield input, output, segments 64 | 65 | print("Start writing to {} ...".format(sem_seg_root)) 66 | start = time.time() 67 | pool.starmap( 68 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 69 | iter_annotations(), 70 | chunksize=100, 71 | ) 72 | print("Finished. time: {:.2f}s".format(time.time() - start)) 73 | 74 | 75 | if __name__ == "__main__": 76 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 77 | for s in ["val2017", "train2017"]: 78 | separate_coco_semantic_from_panoptic( 79 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 80 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 81 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 82 | COCO_CATEGORIES, 83 | ) 84 | -------------------------------------------------------------------------------- /datasets/prepare_cocofied_lvis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import copy 6 | import json 7 | import os 8 | from collections import defaultdict 9 | 10 | # This mapping is extracted from the official LVIS mapping: 11 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 12 | COCO_SYNSET_CATEGORIES = [ 13 | {"synset": "person.n.01", "coco_cat_id": 1}, 14 | {"synset": "bicycle.n.01", "coco_cat_id": 2}, 15 | {"synset": "car.n.01", "coco_cat_id": 3}, 16 | {"synset": "motorcycle.n.01", "coco_cat_id": 4}, 17 | {"synset": "airplane.n.01", "coco_cat_id": 5}, 18 | {"synset": "bus.n.01", "coco_cat_id": 6}, 19 | {"synset": "train.n.01", "coco_cat_id": 7}, 20 | {"synset": "truck.n.01", "coco_cat_id": 8}, 21 | {"synset": "boat.n.01", "coco_cat_id": 9}, 22 | {"synset": "traffic_light.n.01", "coco_cat_id": 10}, 23 | {"synset": "fireplug.n.01", "coco_cat_id": 11}, 24 | {"synset": "stop_sign.n.01", "coco_cat_id": 13}, 25 | {"synset": "parking_meter.n.01", "coco_cat_id": 14}, 26 | {"synset": "bench.n.01", "coco_cat_id": 15}, 27 | {"synset": "bird.n.01", "coco_cat_id": 16}, 28 | {"synset": "cat.n.01", "coco_cat_id": 17}, 29 | {"synset": "dog.n.01", "coco_cat_id": 18}, 30 | {"synset": "horse.n.01", "coco_cat_id": 19}, 31 | {"synset": "sheep.n.01", "coco_cat_id": 20}, 32 | {"synset": "beef.n.01", "coco_cat_id": 21}, 33 | {"synset": "elephant.n.01", "coco_cat_id": 22}, 34 | {"synset": "bear.n.01", "coco_cat_id": 23}, 35 | {"synset": "zebra.n.01", "coco_cat_id": 24}, 36 | {"synset": "giraffe.n.01", "coco_cat_id": 25}, 37 | {"synset": "backpack.n.01", "coco_cat_id": 27}, 38 | {"synset": "umbrella.n.01", "coco_cat_id": 28}, 39 | {"synset": "bag.n.04", "coco_cat_id": 31}, 40 | {"synset": "necktie.n.01", "coco_cat_id": 32}, 41 | {"synset": "bag.n.06", "coco_cat_id": 33}, 42 | {"synset": "frisbee.n.01", "coco_cat_id": 34}, 43 | {"synset": "ski.n.01", "coco_cat_id": 35}, 44 | {"synset": "snowboard.n.01", "coco_cat_id": 36}, 45 | {"synset": "ball.n.06", "coco_cat_id": 37}, 46 | {"synset": "kite.n.03", "coco_cat_id": 38}, 47 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39}, 48 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40}, 49 | {"synset": "skateboard.n.01", "coco_cat_id": 41}, 50 | {"synset": "surfboard.n.01", "coco_cat_id": 42}, 51 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43}, 52 | {"synset": "bottle.n.01", "coco_cat_id": 44}, 53 | {"synset": "wineglass.n.01", "coco_cat_id": 46}, 54 | {"synset": "cup.n.01", "coco_cat_id": 47}, 55 | {"synset": "fork.n.01", "coco_cat_id": 48}, 56 | {"synset": "knife.n.01", "coco_cat_id": 49}, 57 | {"synset": "spoon.n.01", "coco_cat_id": 50}, 58 | {"synset": "bowl.n.03", "coco_cat_id": 51}, 59 | {"synset": "banana.n.02", "coco_cat_id": 52}, 60 | {"synset": "apple.n.01", "coco_cat_id": 53}, 61 | {"synset": "sandwich.n.01", "coco_cat_id": 54}, 62 | {"synset": "orange.n.01", "coco_cat_id": 55}, 63 | {"synset": "broccoli.n.01", "coco_cat_id": 56}, 64 | {"synset": "carrot.n.01", "coco_cat_id": 57}, 65 | {"synset": "frank.n.02", "coco_cat_id": 58}, 66 | {"synset": "pizza.n.01", "coco_cat_id": 59}, 67 | {"synset": "doughnut.n.02", "coco_cat_id": 60}, 68 | {"synset": "cake.n.03", "coco_cat_id": 61}, 69 | {"synset": "chair.n.01", "coco_cat_id": 62}, 70 | {"synset": "sofa.n.01", "coco_cat_id": 63}, 71 | {"synset": "pot.n.04", "coco_cat_id": 64}, 72 | {"synset": "bed.n.01", "coco_cat_id": 65}, 73 | {"synset": "dining_table.n.01", "coco_cat_id": 67}, 74 | {"synset": "toilet.n.02", "coco_cat_id": 70}, 75 | {"synset": "television_receiver.n.01", "coco_cat_id": 72}, 76 | {"synset": "laptop.n.01", 
"coco_cat_id": 73}, 77 | {"synset": "mouse.n.04", "coco_cat_id": 74}, 78 | {"synset": "remote_control.n.01", "coco_cat_id": 75}, 79 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76}, 80 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77}, 81 | {"synset": "microwave.n.02", "coco_cat_id": 78}, 82 | {"synset": "oven.n.01", "coco_cat_id": 79}, 83 | {"synset": "toaster.n.02", "coco_cat_id": 80}, 84 | {"synset": "sink.n.01", "coco_cat_id": 81}, 85 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82}, 86 | {"synset": "book.n.01", "coco_cat_id": 84}, 87 | {"synset": "clock.n.01", "coco_cat_id": 85}, 88 | {"synset": "vase.n.01", "coco_cat_id": 86}, 89 | {"synset": "scissors.n.01", "coco_cat_id": 87}, 90 | {"synset": "teddy.n.01", "coco_cat_id": 88}, 91 | {"synset": "hand_blower.n.01", "coco_cat_id": 89}, 92 | {"synset": "toothbrush.n.01", "coco_cat_id": 90}, 93 | ] 94 | 95 | 96 | def cocofy_lvis(input_filename, output_filename): 97 | """ 98 | Filter LVIS instance segmentation annotations to remove all categories that are not included in 99 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in 100 | the output json are the incontiguous COCO dataset ids. 101 | 102 | Args: 103 | input_filename (str): path to the LVIS json file. 104 | output_filename (str): path to the COCOfied json file. 105 | """ 106 | 107 | with open(input_filename, "r") as f: 108 | lvis_json = json.load(f) 109 | 110 | lvis_annos = lvis_json.pop("annotations") 111 | cocofied_lvis = copy.deepcopy(lvis_json) 112 | lvis_json["annotations"] = lvis_annos 113 | 114 | # Mapping from lvis cat id to coco cat id via synset 115 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]} 116 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES} 117 | # Synsets that we will keep in the dataset 118 | synsets_to_keep = set(synset_to_coco_cat_id.keys()) 119 | coco_cat_id_with_instances = defaultdict(int) 120 | 121 | new_annos = [] 122 | ann_id = 1 123 | for ann in lvis_annos: 124 | lvis_cat_id = ann["category_id"] 125 | synset = lvis_cat_id_to_synset[lvis_cat_id] 126 | if synset not in synsets_to_keep: 127 | continue 128 | coco_cat_id = synset_to_coco_cat_id[synset] 129 | new_ann = copy.deepcopy(ann) 130 | new_ann["category_id"] = coco_cat_id 131 | new_ann["id"] = ann_id 132 | ann_id += 1 133 | new_annos.append(new_ann) 134 | coco_cat_id_with_instances[coco_cat_id] += 1 135 | cocofied_lvis["annotations"] = new_annos 136 | 137 | for image in cocofied_lvis["images"]: 138 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 139 | new_category_list = [] 140 | for lvis_cat_id in image[key]: 141 | synset = lvis_cat_id_to_synset[lvis_cat_id] 142 | if synset not in synsets_to_keep: 143 | continue 144 | coco_cat_id = synset_to_coco_cat_id[synset] 145 | new_category_list.append(coco_cat_id) 146 | coco_cat_id_with_instances[coco_cat_id] += 1 147 | image[key] = new_category_list 148 | 149 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys()) 150 | 151 | new_categories = [] 152 | for cat in lvis_json["categories"]: 153 | synset = cat["synset"] 154 | if synset not in synsets_to_keep: 155 | continue 156 | coco_cat_id = synset_to_coco_cat_id[synset] 157 | if coco_cat_id not in coco_cat_id_with_instances: 158 | continue 159 | new_cat = copy.deepcopy(cat) 160 | new_cat["id"] = coco_cat_id 161 | new_categories.append(new_cat) 162 | cocofied_lvis["categories"] = new_categories 163 | 164 | with 
open(output_filename, "w") as f: 165 | json.dump(cocofied_lvis, f) 166 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename)) 167 | 168 | 169 | if __name__ == "__main__": 170 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis") 171 | for s in ["lvis_v0.5_train", "lvis_v0.5_val"]: 172 | print("Start COCOfing {}.".format(s)) 173 | cocofy_lvis( 174 | os.path.join(dataset_dir, "{}.json".format(s)), 175 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)), 176 | ) 177 | -------------------------------------------------------------------------------- /datasets/prepare_cocofied_lvisv1.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import copy 7 | import json 8 | import os 9 | from collections import defaultdict 10 | 11 | # This mapping is extracted from the official LVIS mapping: 12 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 13 | COCO_SYNSET_CATEGORIES = [ 14 | {"synset": "person.n.01", "coco_cat_id": 1}, 15 | {"synset": "bicycle.n.01", "coco_cat_id": 2}, 16 | {"synset": "car.n.01", "coco_cat_id": 3}, 17 | {"synset": "motorcycle.n.01", "coco_cat_id": 4}, 18 | {"synset": "airplane.n.01", "coco_cat_id": 5}, 19 | {"synset": "bus.n.01", "coco_cat_id": 6}, 20 | {"synset": "train.n.01", "coco_cat_id": 7}, 21 | {"synset": "truck.n.01", "coco_cat_id": 8}, 22 | {"synset": "boat.n.01", "coco_cat_id": 9}, 23 | {"synset": "traffic_light.n.01", "coco_cat_id": 10}, 24 | {"synset": "fireplug.n.01", "coco_cat_id": 11}, 25 | {"synset": "stop_sign.n.01", "coco_cat_id": 13}, 26 | {"synset": "parking_meter.n.01", "coco_cat_id": 14}, 27 | {"synset": "bench.n.01", "coco_cat_id": 15}, 28 | {"synset": "bird.n.01", "coco_cat_id": 16}, 29 | {"synset": "cat.n.01", "coco_cat_id": 17}, 30 | {"synset": "dog.n.01", "coco_cat_id": 18}, 31 | {"synset": "horse.n.01", "coco_cat_id": 19}, 32 | {"synset": "sheep.n.01", "coco_cat_id": 20}, 33 | {"synset": "beef.n.01", "coco_cat_id": 21}, 34 | {"synset": "elephant.n.01", "coco_cat_id": 22}, 35 | {"synset": "bear.n.01", "coco_cat_id": 23}, 36 | {"synset": "zebra.n.01", "coco_cat_id": 24}, 37 | {"synset": "giraffe.n.01", "coco_cat_id": 25}, 38 | {"synset": "backpack.n.01", "coco_cat_id": 27}, 39 | {"synset": "umbrella.n.01", "coco_cat_id": 28}, 40 | {"synset": "bag.n.04", "coco_cat_id": 31}, 41 | {"synset": "necktie.n.01", "coco_cat_id": 32}, 42 | {"synset": "bag.n.06", "coco_cat_id": 33}, 43 | {"synset": "frisbee.n.01", "coco_cat_id": 34}, 44 | {"synset": "ski.n.01", "coco_cat_id": 35}, 45 | {"synset": "snowboard.n.01", "coco_cat_id": 36}, 46 | {"synset": "ball.n.06", "coco_cat_id": 37}, 47 | {"synset": "kite.n.03", "coco_cat_id": 38}, 48 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39}, 49 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40}, 50 | {"synset": "skateboard.n.01", "coco_cat_id": 41}, 51 | {"synset": "surfboard.n.01", "coco_cat_id": 42}, 52 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43}, 53 | {"synset": "bottle.n.01", "coco_cat_id": 44}, 54 | {"synset": "wineglass.n.01", "coco_cat_id": 46}, 55 | {"synset": "cup.n.01", "coco_cat_id": 47}, 56 | {"synset": "fork.n.01", "coco_cat_id": 48}, 57 | {"synset": "knife.n.01", "coco_cat_id": 49}, 58 | {"synset": "spoon.n.01", "coco_cat_id": 50}, 59 | {"synset": "bowl.n.03", "coco_cat_id": 51}, 60 | {"synset": "banana.n.02", "coco_cat_id": 52}, 61 | 
{"synset": "apple.n.01", "coco_cat_id": 53}, 62 | {"synset": "sandwich.n.01", "coco_cat_id": 54}, 63 | {"synset": "orange.n.01", "coco_cat_id": 55}, 64 | {"synset": "broccoli.n.01", "coco_cat_id": 56}, 65 | {"synset": "carrot.n.01", "coco_cat_id": 57}, 66 | {"synset": "frank.n.02", "coco_cat_id": 58}, 67 | {"synset": "pizza.n.01", "coco_cat_id": 59}, 68 | {"synset": "doughnut.n.02", "coco_cat_id": 60}, 69 | {"synset": "cake.n.03", "coco_cat_id": 61}, 70 | {"synset": "chair.n.01", "coco_cat_id": 62}, 71 | {"synset": "sofa.n.01", "coco_cat_id": 63}, 72 | {"synset": "pot.n.04", "coco_cat_id": 64}, 73 | {"synset": "bed.n.01", "coco_cat_id": 65}, 74 | {"synset": "dining_table.n.01", "coco_cat_id": 67}, 75 | {"synset": "toilet.n.02", "coco_cat_id": 70}, 76 | {"synset": "television_receiver.n.01", "coco_cat_id": 72}, 77 | {"synset": "laptop.n.01", "coco_cat_id": 73}, 78 | {"synset": "mouse.n.04", "coco_cat_id": 74}, 79 | {"synset": "remote_control.n.01", "coco_cat_id": 75}, 80 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76}, 81 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77}, 82 | {"synset": "microwave.n.02", "coco_cat_id": 78}, 83 | {"synset": "oven.n.01", "coco_cat_id": 79}, 84 | {"synset": "toaster.n.02", "coco_cat_id": 80}, 85 | {"synset": "sink.n.01", "coco_cat_id": 81}, 86 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82}, 87 | {"synset": "book.n.01", "coco_cat_id": 84}, 88 | {"synset": "clock.n.01", "coco_cat_id": 85}, 89 | {"synset": "vase.n.01", "coco_cat_id": 86}, 90 | {"synset": "scissors.n.01", "coco_cat_id": 87}, 91 | {"synset": "teddy.n.01", "coco_cat_id": 88}, 92 | {"synset": "hand_blower.n.01", "coco_cat_id": 89}, 93 | {"synset": "toothbrush.n.01", "coco_cat_id": 90}, 94 | ] 95 | 96 | 97 | def cocofy_lvis(input_filename, output_filename): 98 | """ 99 | Filter LVIS instance segmentation annotations to remove all categories that are not included in 100 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in 101 | the output json are the incontiguous COCO dataset ids. 102 | 103 | Args: 104 | input_filename (str): path to the LVIS json file. 105 | output_filename (str): path to the COCOfied json file. 
106 | """ 107 | 108 | with open(input_filename, "r") as f: 109 | lvis_json = json.load(f) 110 | 111 | lvis_annos = lvis_json.pop("annotations") 112 | lvis_imgs = lvis_json.pop("images") 113 | cocofied_lvis = copy.deepcopy(lvis_json) 114 | lvis_json["annotations"] = lvis_annos 115 | lvis_json["images"] = lvis_imgs 116 | 117 | # Mapping from lvis cat id to coco cat id via synset 118 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]} 119 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES} 120 | # Synsets that we will keep in the dataset 121 | synsets_to_keep = set(synset_to_coco_cat_id.keys()) 122 | coco_cat_id_with_instances = defaultdict(int) 123 | 124 | invalid_img_ids = set() 125 | new_img_id_dict = {} 126 | 127 | new_images = [] 128 | img_id = 1 129 | for image in lvis_imgs: 130 | coco_url = image['coco_url'] 131 | split, file_name = coco_url.split('/')[-2:] 132 | if split == 'train2017': 133 | invalid_img_ids.add(image['id']) 134 | continue 135 | new_img = copy.deepcopy(image) 136 | new_img_id_dict[new_img['id']] = img_id 137 | new_img['id'] = img_id 138 | img_id += 1 139 | new_img['file_name'] = file_name 140 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 141 | new_category_list = [] 142 | for lvis_cat_id in new_img[key]: 143 | synset = lvis_cat_id_to_synset[lvis_cat_id] 144 | if synset not in synsets_to_keep: 145 | continue 146 | coco_cat_id = synset_to_coco_cat_id[synset] 147 | new_category_list.append(coco_cat_id) 148 | coco_cat_id_with_instances[coco_cat_id] += 1 149 | new_img[key] = new_category_list 150 | new_images.append(new_img) 151 | cocofied_lvis["images"] = new_images 152 | 153 | new_annos = [] 154 | ann_id = 1 155 | for ann in lvis_annos: 156 | img_id = ann["image_id"] 157 | if img_id in invalid_img_ids: 158 | continue 159 | lvis_cat_id = ann["category_id"] 160 | synset = lvis_cat_id_to_synset[lvis_cat_id] 161 | if synset not in synsets_to_keep: 162 | continue 163 | coco_cat_id = synset_to_coco_cat_id[synset] 164 | new_ann = copy.deepcopy(ann) 165 | new_ann["category_id"] = coco_cat_id 166 | new_ann["id"] = ann_id 167 | ann_id += 1 168 | new_ann["image_id"] = new_img_id_dict[img_id] 169 | new_annos.append(new_ann) 170 | coco_cat_id_with_instances[coco_cat_id] += 1 171 | cocofied_lvis["annotations"] = new_annos 172 | 173 | 174 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys()) 175 | 176 | new_categories = [] 177 | for cat in lvis_json["categories"]: 178 | synset = cat["synset"] 179 | if synset not in synsets_to_keep: 180 | continue 181 | coco_cat_id = synset_to_coco_cat_id[synset] 182 | if coco_cat_id not in coco_cat_id_with_instances: 183 | continue 184 | new_cat = copy.deepcopy(cat) 185 | new_cat["id"] = coco_cat_id 186 | new_categories.append(new_cat) 187 | cocofied_lvis["categories"] = new_categories 188 | 189 | with open(output_filename, "w") as f: 190 | json.dump(cocofied_lvis, f) 191 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename)) 192 | 193 | 194 | if __name__ == "__main__": 195 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis") 196 | for s in ["lvis_v1_val"]: 197 | print("Start COCOfing {}.".format(s)) 198 | cocofy_lvis( 199 | os.path.join(dataset_dir, "{}.json".format(s)), 200 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)), 201 | ) 202 | -------------------------------------------------------------------------------- /demo/demo.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import argparse 6 | import glob 7 | import multiprocessing as mp 8 | import os 9 | 10 | # fmt: off 11 | import sys 12 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 13 | # fmt: on 14 | 15 | import tempfile 16 | import time 17 | import warnings 18 | 19 | import cv2 20 | import numpy as np 21 | import tqdm 22 | 23 | from detectron2.config import get_cfg 24 | from detectron2.data.detection_utils import read_image 25 | from detectron2.projects.deeplab import add_deeplab_config 26 | from detectron2.utils.logger import setup_logger 27 | 28 | from mask2former import add_maskformer2_config 29 | from predictor import VisualizationDemo 30 | 31 | 32 | # constants 33 | WINDOW_NAME = "mask2former demo" 34 | 35 | 36 | def setup_cfg(args): 37 | # load config from file and command-line arguments 38 | cfg = get_cfg() 39 | add_deeplab_config(cfg) 40 | add_maskformer2_config(cfg) 41 | cfg.merge_from_file(args.config_file) 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | return cfg 45 | 46 | 47 | def get_parser(): 48 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") 49 | parser.add_argument( 50 | "--config-file", 51 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml", 52 | metavar="FILE", 53 | help="path to config file", 54 | ) 55 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 56 | parser.add_argument("--video-input", help="Path to video file.") 57 | parser.add_argument( 58 | "--input", 59 | nargs="+", 60 | help="A list of space separated input images; " 61 | "or a single glob pattern such as 'directory/*.jpg'", 62 | ) 63 | parser.add_argument( 64 | "--output", 65 | help="A file or directory to save output visualizations. " 66 | "If not given, will show output in an OpenCV window.", 67 | ) 68 | 69 | parser.add_argument( 70 | "--confidence-threshold", 71 | type=float, 72 | default=0.5, 73 | help="Minimum score for instance predictions to be shown", 74 | ) 75 | parser.add_argument( 76 | "--opts", 77 | help="Modify config options using the command-line 'KEY VALUE' pairs", 78 | default=[], 79 | nargs=argparse.REMAINDER, 80 | ) 81 | parser.add_argument( 82 | "--blur", 83 | help="A directory containing blurred version of the inputs (e.g., blurred human faces). " 84 | "If given, predictions are visualized on the blurred images." 
85 | "Images inside this folder need to have the same name as the input images", 86 | ) 87 | return parser 88 | 89 | 90 | def test_opencv_video_format(codec, file_ext): 91 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 92 | filename = os.path.join(dir, "test_file" + file_ext) 93 | writer = cv2.VideoWriter( 94 | filename=filename, 95 | fourcc=cv2.VideoWriter_fourcc(*codec), 96 | fps=float(30), 97 | frameSize=(10, 10), 98 | isColor=True, 99 | ) 100 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 101 | writer.release() 102 | if os.path.isfile(filename): 103 | return True 104 | return False 105 | 106 | 107 | if __name__ == "__main__": 108 | mp.set_start_method("spawn", force=True) 109 | args = get_parser().parse_args() 110 | setup_logger(name="fvcore") 111 | logger = setup_logger() 112 | logger.info("Arguments: " + str(args)) 113 | 114 | cfg = setup_cfg(args) 115 | 116 | demo = VisualizationDemo(cfg) 117 | 118 | if args.input: 119 | if len(args.input) == 1: 120 | args.input = glob.glob(os.path.expanduser(args.input[0])) 121 | assert args.input, "The input path(s) was not found" 122 | for path in tqdm.tqdm(args.input, disable=not args.output): 123 | # use PIL, to be consistent with evaluation 124 | img = read_image(path, format="BGR") 125 | if args.blur: 126 | path_blur = os.path.join(args.blur, path.split('/')[-1]) 127 | img_blur = read_image(path_blur, format="BGR") 128 | else: 129 | img_blur = None 130 | start_time = time.time() 131 | predictions, visualized_output = demo.run_on_image(img, blur=img_blur) 132 | logger.info( 133 | "{}: {} in {:.2f}s".format( 134 | path, 135 | "detected {} instances".format(len(predictions["instances"])) 136 | if "instances" in predictions 137 | else "finished", 138 | time.time() - start_time, 139 | ) 140 | ) 141 | 142 | if args.output: 143 | if os.path.isdir(args.output): 144 | assert os.path.isdir(args.output), args.output 145 | out_filename = os.path.join(args.output, os.path.basename(path)) 146 | else: 147 | assert len(args.input) == 1, "Please specify a directory with args.output" 148 | out_filename = args.output 149 | visualized_output.save(out_filename) 150 | else: 151 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 152 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 153 | if cv2.waitKey(0) == 27: 154 | break # esc to quit 155 | elif args.webcam: 156 | assert args.input is None, "Cannot have both --input and --webcam!" 157 | assert args.output is None, "output not yet supported with --webcam!" 
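# Webcam mode: read frames from the default camera (device 0), show the visualized
# predictions frame by frame, and stop when the Esc key is pressed.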
158 | cam = cv2.VideoCapture(0) 159 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 160 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 161 | cv2.imshow(WINDOW_NAME, vis) 162 | if cv2.waitKey(1) == 27: 163 | break # esc to quit 164 | cam.release() 165 | cv2.destroyAllWindows() 166 | elif args.video_input: 167 | video = cv2.VideoCapture(args.video_input) 168 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 169 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 170 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 171 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 172 | basename = os.path.basename(args.video_input) 173 | codec, file_ext = ( 174 | ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") 175 | ) 176 | if codec == ".mp4v": 177 | warnings.warn("x264 codec not available, switching to mp4v") 178 | if args.output: 179 | if os.path.isdir(args.output): 180 | output_fname = os.path.join(args.output, basename) 181 | output_fname = os.path.splitext(output_fname)[0] + file_ext 182 | else: 183 | output_fname = args.output 184 | assert not os.path.isfile(output_fname), output_fname 185 | output_file = cv2.VideoWriter( 186 | filename=output_fname, 187 | # some installation of opencv may not support x264 (due to its license), 188 | # you can try other format (e.g. MPEG) 189 | fourcc=cv2.VideoWriter_fourcc(*codec), 190 | fps=float(frames_per_second), 191 | frameSize=(width, height), 192 | isColor=True, 193 | ) 194 | assert os.path.isfile(args.video_input) 195 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 196 | if args.output: 197 | output_file.write(vis_frame) 198 | else: 199 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 200 | cv2.imshow(basename, vis_frame) 201 | if cv2.waitKey(1) == 27: 202 | break # esc to quit 203 | video.release() 204 | if args.output: 205 | output_file.release() 206 | else: 207 | cv2.destroyAllWindows() 208 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import atexit 6 | import bisect 7 | import multiprocessing as mp 8 | from collections import deque 9 | 10 | import cv2 11 | import torch 12 | 13 | from detectron2.data import MetadataCatalog 14 | from detectron2.engine.defaults import DefaultPredictor 15 | from detectron2.utils.video_visualizer import VideoVisualizer 16 | from detectron2.utils.visualizer import ColorMode, Visualizer 17 | 18 | 19 | class VisualizationDemo(object): 20 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 21 | """ 22 | Args: 23 | cfg (CfgNode): 24 | instance_mode (ColorMode): 25 | parallel (bool): whether to run the model in different processes from visualization. 26 | Useful since the visualization logic can be slow. 
27 | """ 28 | self.metadata = MetadataCatalog.get( 29 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 30 | ) 31 | self.cpu_device = torch.device("cpu") 32 | self.instance_mode = instance_mode 33 | 34 | self.parallel = parallel 35 | if parallel: 36 | num_gpu = torch.cuda.device_count() 37 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 38 | else: 39 | self.predictor = DefaultPredictor(cfg) 40 | 41 | def run_on_image(self, image, blur=None): 42 | """ 43 | Args: 44 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 45 | This is the format used by OpenCV. 46 | Returns: 47 | predictions (dict): the output of the model. 48 | vis_output (VisImage): the visualized image output. 49 | """ 50 | vis_output = None 51 | predictions = self.predictor(image) 52 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 53 | if blur is not None: 54 | blur = blur[:, :, ::-1] 55 | visualizer = Visualizer(blur, self.metadata, instance_mode=self.instance_mode) 56 | else: 57 | image = image[:, :, ::-1] 58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 59 | if "panoptic_seg" in predictions: 60 | panoptic_seg, segments_info = predictions["panoptic_seg"] 61 | vis_output = visualizer.draw_panoptic_seg_predictions( 62 | panoptic_seg.to(self.cpu_device), segments_info 63 | ) 64 | else: 65 | if "sem_seg" in predictions: 66 | vis_output = visualizer.draw_sem_seg( 67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 68 | ) 69 | if "instances" in predictions: 70 | instances = predictions["instances"].to(self.cpu_device) 71 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 72 | 73 | return predictions, vis_output 74 | 75 | def _frame_from_video(self, video): 76 | while video.isOpened(): 77 | success, frame = video.read() 78 | if success: 79 | yield frame 80 | else: 81 | break 82 | 83 | def run_on_video(self, video): 84 | """ 85 | Visualizes predictions on frames of the input video. 86 | Args: 87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 88 | either a webcam or a video file. 89 | Yields: 90 | ndarray: BGR visualizations of each video frame. 
91 | """ 92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 93 | 94 | def process_predictions(frame, predictions): 95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 96 | if "panoptic_seg" in predictions: 97 | panoptic_seg, segments_info = predictions["panoptic_seg"] 98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 99 | frame, panoptic_seg.to(self.cpu_device), segments_info 100 | ) 101 | elif "instances" in predictions: 102 | predictions = predictions["instances"].to(self.cpu_device) 103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 104 | elif "sem_seg" in predictions: 105 | vis_frame = video_visualizer.draw_sem_seg( 106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 107 | ) 108 | 109 | # Converts Matplotlib RGB format to OpenCV BGR format 110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 111 | return vis_frame 112 | 113 | frame_gen = self._frame_from_video(video) 114 | if self.parallel: 115 | buffer_size = self.predictor.default_buffer_size 116 | 117 | frame_data = deque() 118 | 119 | for cnt, frame in enumerate(frame_gen): 120 | frame_data.append(frame) 121 | self.predictor.put(frame) 122 | 123 | if cnt >= buffer_size: 124 | frame = frame_data.popleft() 125 | predictions = self.predictor.get() 126 | yield process_predictions(frame, predictions) 127 | 128 | while len(frame_data): 129 | frame = frame_data.popleft() 130 | predictions = self.predictor.get() 131 | yield process_predictions(frame, predictions) 132 | else: 133 | for frame in frame_gen: 134 | yield process_predictions(frame, self.predictor(frame)) 135 | 136 | 137 | class AsyncPredictor: 138 | """ 139 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 140 | Because rendering the visualization takes considerably amount of time, 141 | this helps improve throughput a little bit when rendering videos. 
142 | """ 143 | 144 | class _StopToken: 145 | pass 146 | 147 | class _PredictWorker(mp.Process): 148 | def __init__(self, cfg, task_queue, result_queue): 149 | self.cfg = cfg 150 | self.task_queue = task_queue 151 | self.result_queue = result_queue 152 | super().__init__() 153 | 154 | def run(self): 155 | predictor = DefaultPredictor(self.cfg) 156 | 157 | while True: 158 | task = self.task_queue.get() 159 | if isinstance(task, AsyncPredictor._StopToken): 160 | break 161 | idx, data = task 162 | result = predictor(data) 163 | self.result_queue.put((idx, result)) 164 | 165 | def __init__(self, cfg, num_gpus: int = 1): 166 | """ 167 | Args: 168 | cfg (CfgNode): 169 | num_gpus (int): if 0, will run on CPU 170 | """ 171 | num_workers = max(num_gpus, 1) 172 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 173 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 174 | self.procs = [] 175 | for gpuid in range(max(num_gpus, 1)): 176 | cfg = cfg.clone() 177 | cfg.defrost() 178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 179 | self.procs.append( 180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 181 | ) 182 | 183 | self.put_idx = 0 184 | self.get_idx = 0 185 | self.result_rank = [] 186 | self.result_data = [] 187 | 188 | for p in self.procs: 189 | p.start() 190 | atexit.register(self.shutdown) 191 | 192 | def put(self, image): 193 | self.put_idx += 1 194 | self.task_queue.put((self.put_idx, image)) 195 | 196 | def get(self): 197 | self.get_idx += 1 # the index needed for this request 198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 199 | res = self.result_data[0] 200 | del self.result_data[0], self.result_rank[0] 201 | return res 202 | 203 | while True: 204 | # make sure the results are returned in the correct order 205 | idx, res = self.result_queue.get() 206 | if idx == self.get_idx: 207 | return res 208 | insert = bisect.bisect(self.result_rank, idx) 209 | self.result_rank.insert(insert, idx) 210 | self.result_data.insert(insert, res) 211 | 212 | def __len__(self): 213 | return self.put_idx - self.get_idx 214 | 215 | def __call__(self, image): 216 | self.put(image) 217 | return self.get() 218 | 219 | def shutdown(self): 220 | for _ in self.procs: 221 | self.task_queue.put(AsyncPredictor._StopToken()) 222 | 223 | @property 224 | def default_buffer_size(self): 225 | return len(self.procs) * 5 226 | -------------------------------------------------------------------------------- /demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo1.png -------------------------------------------------------------------------------- /demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo2.png -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import data # register all new datasets 4 | from . 
import modeling 5 | 6 | # config 7 | from .config import add_maskformer2_config 8 | 9 | # dataset loading 10 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 11 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 12 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 13 | MaskFormerInstanceDatasetMapper, 14 | ) 15 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 16 | MaskFormerPanopticDatasetMapper, 17 | ) 18 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | MaskFormerSemanticDatasetMapper, 20 | ) 21 | 22 | # models 23 | from .maskformer_model import MaskFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Adapted for AutoFocusFormer by Ziwen 2023 3 | 4 | from detectron2.config import CfgNode as CN 5 | 6 | 7 | def add_maskformer2_config(cfg): 8 | """ 9 | Add config for MASK_FORMER. 10 | """ 11 | # NOTE: configs from original maskformer 12 | # data config 13 | # select the dataset mapper 14 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 15 | # Color augmentation 16 | cfg.INPUT.COLOR_AUG_SSD = False 17 | # We retry random cropping until no single category in semantic segmentation GT occupies more 18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 20 | # Pad image and segmentation GT in dataset mapper. 21 | cfg.INPUT.SIZE_DIVISIBILITY = -1 22 | 23 | # solver config 24 | # weight decay on embedding 25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 26 | # optimizer 27 | cfg.SOLVER.OPTIMIZER = "ADAMW" 28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 29 | 30 | # mask_former model config 31 | cfg.MODEL.MASK_FORMER = CN() 32 | 33 | # loss 34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 35 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 36 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 38 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 39 | 40 | # transformer config 41 | cfg.MODEL.MASK_FORMER.NHEADS = 8 42 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 43 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 44 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 45 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 46 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 47 | 48 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 49 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 50 | 51 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 52 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 53 | 54 | # mask_former inference config 55 | cfg.MODEL.MASK_FORMER.TEST = CN() 56 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 57 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 59 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 61 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 62 | 63 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) 64 | # you can use this config to override 65 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 66 | 67 | # pixel decoder config 68 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 69 | # adding transformer in pixel decoder 70 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 71 | # pixel decoder 72 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MSDeformAttnPixelDecoder" 73 | 74 | # autofocusformer backbone 75 | cfg.MODEL.AFF = CN() 76 | cfg.MODEL.AFF.EMBED_DIM = [32, 128, 256, 384] 77 | cfg.MODEL.AFF.DEPTHS = [2, 2, 6, 2] 78 | cfg.MODEL.AFF.NUM_HEADS = [3, 6, 12, 24] 79 | cfg.MODEL.AFF.MLP_RATIO = 2.0 80 | cfg.MODEL.AFF.CLUSTER_SIZE = 8 81 | cfg.MODEL.AFF.NBHD_SIZE = [48, 48, 48, 48] 82 | cfg.MODEL.AFF.LAYER_SCALE = 0.0 83 | cfg.MODEL.AFF.ALPHA = 4.0 84 | cfg.MODEL.AFF.DS_RATE = 0.25 85 | cfg.MODEL.AFF.RESERVE = True 86 | cfg.MODEL.AFF.DROP_RATE = 0.0 87 | cfg.MODEL.AFF.ATTN_DROP_RATE = 0.0 88 | cfg.MODEL.AFF.DROP_PATH_RATE = 0.3 89 | cfg.MODEL.AFF.PATCH_NORM = True 90 | cfg.MODEL.AFF.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 91 | cfg.MODEL.AFF.SHEPARD_POWER = 6.0 92 | cfg.MODEL.AFF.SHEPARD_POWER_LEARNABLE = True 93 | 94 | # NOTE: maskformer2 extra configs 95 | # transformer module 96 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 97 | 98 | # LSJ aug 99 | cfg.INPUT.IMAGE_SIZE = 1024 100 | cfg.INPUT.MIN_SCALE = 0.1 101 | cfg.INPUT.MAX_SCALE = 2.0 102 | 103 | # MSDeformAttn encoder configs 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 105 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 106 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 107 | 108 | # point loss configs 109 | # Number of points sampled during training for a mask point head. 110 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 111 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 112 | # original paper. 113 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 114 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 115 | # the original paper. 116 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 117 | -------------------------------------------------------------------------------- /mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import datasets 4 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | 4 | import copy 5 | import logging 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | 14 | from pycocotools import mask as coco_mask 15 | 16 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 17 | 18 | 19 | def convert_coco_poly_to_mask(segmentations, height, width): 20 | masks = [] 21 | for polygons in segmentations: 22 | rles = coco_mask.frPyObjects(polygons, height, width) 23 | mask = coco_mask.decode(rles) 24 | if len(mask.shape) < 3: 25 | mask = mask[..., None] 26 | mask = torch.as_tensor(mask, dtype=torch.uint8) 27 | mask = mask.any(dim=2) 28 | masks.append(mask) 29 | if masks: 30 | masks = torch.stack(masks, dim=0) 31 | else: 32 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 33 | return masks 34 | 35 | 36 | def build_transform_gen(cfg, is_train): 37 | """ 38 | Create a list of default :class:`Augmentation` from config. 39 | Now it includes resizing and flipping. 40 | Returns: 41 | list[Augmentation] 42 | """ 43 | assert is_train, "Only support training augmentation" 44 | image_size = cfg.INPUT.IMAGE_SIZE 45 | min_scale = cfg.INPUT.MIN_SCALE 46 | max_scale = cfg.INPUT.MAX_SCALE 47 | 48 | augmentation = [] 49 | 50 | if cfg.INPUT.RANDOM_FLIP != "none": 51 | augmentation.append( 52 | T.RandomFlip( 53 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 54 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 55 | ) 56 | ) 57 | 58 | augmentation.extend([ 59 | T.ResizeScale( 60 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 61 | ), 62 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 63 | ]) 64 | 65 | return augmentation 66 | 67 | 68 | # This is specifically designed for the COCO dataset. 69 | class COCOInstanceNewBaselineDatasetMapper: 70 | """ 71 | A callable which takes a dataset dict in Detectron2 Dataset format, 72 | and map it into a format used by MaskFormer. 73 | 74 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 75 | 76 | The callable currently does the following: 77 | 78 | 1. Read the image from "file_name" 79 | 2. Applies geometric transforms to the image and annotation 80 | 3. Find and applies suitable cropping to the image and annotation 81 | 4. Prepare image and annotation to Tensors 82 | """ 83 | 84 | @configurable 85 | def __init__( 86 | self, 87 | is_train=True, 88 | *, 89 | tfm_gens, 90 | image_format, 91 | ): 92 | """ 93 | NOTE: this interface is experimental. 94 | Args: 95 | is_train: for training or inference 96 | augmentations: a list of augmentations or deterministic transforms to apply 97 | tfm_gens: data augmentation 98 | image_format: an image format supported by :func:`detection_utils.read_image`. 
99 | """ 100 | self.tfm_gens = tfm_gens 101 | logging.getLogger(__name__).info( 102 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 103 | ) 104 | 105 | self.img_format = image_format 106 | self.is_train = is_train 107 | 108 | @classmethod 109 | def from_config(cls, cfg, is_train=True): 110 | # Build augmentation 111 | tfm_gens = build_transform_gen(cfg, is_train) 112 | 113 | ret = { 114 | "is_train": is_train, 115 | "tfm_gens": tfm_gens, 116 | "image_format": cfg.INPUT.FORMAT, 117 | } 118 | return ret 119 | 120 | def __call__(self, dataset_dict): 121 | """ 122 | Args: 123 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 124 | 125 | Returns: 126 | dict: a format that builtin models in detectron2 accept 127 | """ 128 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 129 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 130 | utils.check_image_size(dataset_dict, image) 131 | 132 | # TODO: get padding mask 133 | # by feeding a "segmentation mask" to the same transforms 134 | padding_mask = np.ones(image.shape[:2]) 135 | 136 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 137 | # the crop transformation has default padding value 0 for segmentation 138 | padding_mask = transforms.apply_segmentation(padding_mask) 139 | padding_mask = ~ padding_mask.astype(bool) 140 | 141 | image_shape = image.shape[:2] # h, w 142 | 143 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 144 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 145 | # Therefore it's important to use torch.Tensor. 146 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 147 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 148 | 149 | if not self.is_train: 150 | # USER: Modify this if you want to keep them for some reason. 151 | dataset_dict.pop("annotations", None) 152 | return dataset_dict 153 | 154 | if "annotations" in dataset_dict: 155 | # USER: Modify this if you want to keep them for some reason. 156 | for anno in dataset_dict["annotations"]: 157 | # Let's always keep mask 158 | # if not self.mask_on: 159 | # anno.pop("segmentation", None) 160 | anno.pop("keypoints", None) 161 | 162 | # USER: Implement additional transformations if you have other types of data 163 | annos = [ 164 | utils.transform_instance_annotations(obj, transforms, image_shape) 165 | for obj in dataset_dict.pop("annotations") 166 | if obj.get("iscrowd", 0) == 0 167 | ] 168 | # NOTE: does not support BitMask due to augmentation 169 | # Current BitMask cannot handle empty objects 170 | instances = utils.annotations_to_instances(annos, image_shape) 171 | # After transforms such as cropping are applied, the bounding box may no longer 172 | # tightly bound the object. As an example, imagine a triangle object 173 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 174 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 175 | # the intersection of original bounding box and the cropping box. 
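# For that reason, recompute the boxes from the transformed masks below rather than
# transforming the original annotation boxes.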
176 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 177 | # Need to filter empty instances first (due to augmentation) 178 | instances = utils.filter_empty_instances(instances) 179 | # Generate masks from polygon 180 | h, w = instances.image_size 181 | # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) 182 | if hasattr(instances, 'gt_masks'): 183 | gt_masks = instances.gt_masks 184 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 185 | instances.gt_masks = gt_masks 186 | dataset_dict["instances"] = instances 187 | 188 | return dataset_dict 189 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | 4 | import copy 5 | import logging 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 
82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import pycocotools.mask as mask_util 8 | import torch 9 | from torch.nn import functional as F 10 | 11 | from detectron2.config import configurable 12 | from detectron2.data import detection_utils as utils 13 | from detectron2.data import transforms as T 14 | from detectron2.projects.point_rend import ColorAugSSDTransform 15 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 16 | 17 | __all__ = ["MaskFormerInstanceDatasetMapper"] 18 | 19 | 20 | class MaskFormerInstanceDatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by MaskFormer for instance segmentation. 24 | 25 | The callable currently does the following: 26 | 27 | 1. Read the image from "file_name" 28 | 2. Applies geometric transforms to the image and annotation 29 | 3. Find and applies suitable cropping to the image and annotation 30 | 4. Prepare image and annotation to Tensors 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, 36 | is_train=True, 37 | *, 38 | augmentations, 39 | image_format, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | size_divisibility: pad image size to be divisible by this value 49 | """ 50 | self.is_train = is_train 51 | self.tfm_gens = augmentations 52 | self.img_format = image_format 53 | self.size_divisibility = size_divisibility 54 | 55 | logger = logging.getLogger(__name__) 56 | mode = "training" if is_train else "inference" 57 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 58 | 59 | @classmethod 60 | def from_config(cls, cfg, is_train=True): 61 | # Build augmentation 62 | augs = [ 63 | T.ResizeShortestEdge( 64 | cfg.INPUT.MIN_SIZE_TRAIN, 65 | cfg.INPUT.MAX_SIZE_TRAIN, 66 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 67 | ) 68 | ] 69 | if cfg.INPUT.CROP.ENABLED: 70 | augs.append( 71 | T.RandomCrop( 72 | cfg.INPUT.CROP.TYPE, 73 | cfg.INPUT.CROP.SIZE, 74 | ) 75 | ) 76 | if cfg.INPUT.COLOR_AUG_SSD: 77 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 78 | augs.append(T.RandomFlip()) 79 | 80 | ret = { 81 | "is_train": is_train, 82 | "augmentations": augs, 83 | "image_format": cfg.INPUT.FORMAT, 84 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 85 | } 86 | return ret 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
97 | 98 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 99 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 100 | utils.check_image_size(dataset_dict, image) 101 | 102 | aug_input = T.AugInput(image) 103 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 104 | image = aug_input.image 105 | 106 | # transform instnace masks 107 | assert "annotations" in dataset_dict 108 | for anno in dataset_dict["annotations"]: 109 | anno.pop("keypoints", None) 110 | 111 | annos = [ 112 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 113 | for obj in dataset_dict.pop("annotations") 114 | if obj.get("iscrowd", 0) == 0 115 | ] 116 | 117 | if len(annos): 118 | assert "segmentation" in annos[0] 119 | segms = [obj["segmentation"] for obj in annos] 120 | masks = [] 121 | for segm in segms: 122 | if isinstance(segm, list): 123 | # polygon 124 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 125 | elif isinstance(segm, dict): 126 | # COCO RLE 127 | masks.append(mask_util.decode(segm)) 128 | elif isinstance(segm, np.ndarray): 129 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 130 | segm.ndim 131 | ) 132 | # mask array 133 | masks.append(segm) 134 | else: 135 | raise ValueError( 136 | "Cannot convert segmentation of type '{}' to BitMasks!" 137 | "Supported types are: polygons as list[list[float] or ndarray]," 138 | " COCO-style RLE as a dict, or a binary segmentation mask " 139 | " in a 2D numpy array of shape HxW.".format(type(segm)) 140 | ) 141 | 142 | # Pad image and segmentation label here! 143 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 144 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 145 | 146 | classes = [int(obj["category_id"]) for obj in annos] 147 | classes = torch.tensor(classes, dtype=torch.int64) 148 | 149 | if self.size_divisibility > 0: 150 | image_size = (image.shape[-2], image.shape[-1]) 151 | padding_size = [ 152 | 0, 153 | self.size_divisibility - image_size[1], 154 | 0, 155 | self.size_divisibility - image_size[0], 156 | ] 157 | # pad image 158 | image = F.pad(image, padding_size, value=128).contiguous() 159 | # pad mask 160 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 161 | 162 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 163 | 164 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 165 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 166 | # Therefore it's important to use torch.Tensor. 167 | dataset_dict["image"] = image 168 | 169 | # Prepare per-category binary masks 170 | instances = Instances(image_shape) 171 | instances.gt_classes = classes 172 | if len(masks) == 0: 173 | # Some image does not have annotation (all ignored) 174 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 175 | else: 176 | masks = BitMasks(torch.stack(masks)) 177 | instances.gt_masks = masks.tensor 178 | 179 | dataset_dict["instances"] = instances 180 | 181 | return dataset_dict 182 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
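A brief usage note for the MaskFormerInstanceDatasetMapper just shown: like the other mappers in this directory, it is built from a config via detectron2's @configurable mechanism and passed to the training data loader. The sketch below is a minimal illustration under the assumption that a detectron2 config cfg containing the INPUT keys referenced above has already been loaded; it is not taken from this repository's train_net.py.

from detectron2.data import build_detection_train_loader

from mask2former.data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
)

# `cfg` is assumed to be an already-populated detectron2 CfgNode (not shown here).
mapper = MaskFormerInstanceDatasetMapper(cfg, is_train=True)
train_loader = build_detection_train_loader(cfg, mapper=mapper)

for batch in train_loader:
    # Each element carries "image" (a C x H x W tensor) and "instances" holding
    # gt_classes and gt_masks, exactly as assembled in __call__ above.
    break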
2 | 3 | import copy 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 
134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.data import detection_utils as utils 13 | from detectron2.data import transforms as T 14 | from detectron2.projects.point_rend import ColorAugSSDTransform 15 | from detectron2.structures import BitMasks, Instances 16 | 17 | __all__ = ["MaskFormerSemanticDatasetMapper"] 18 | 19 | 20 | class MaskFormerSemanticDatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by MaskFormer for semantic segmentation. 24 | 25 | The callable currently does the following: 26 | 27 | 1. Read the image from "file_name" 28 | 2. Applies geometric transforms to the image and annotation 29 | 3. Find and applies suitable cropping to the image and annotation 30 | 4. Prepare image and annotation to Tensors 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, 36 | is_train=True, 37 | *, 38 | augmentations, 39 | image_format, 40 | ignore_label, 41 | size_divisibility, 42 | ): 43 | """ 44 | NOTE: this interface is experimental. 45 | Args: 46 | is_train: for training or inference 47 | augmentations: a list of augmentations or deterministic transforms to apply 48 | image_format: an image format supported by :func:`detection_utils.read_image`. 
49 | ignore_label: the label that is ignored to evaluation 50 | size_divisibility: pad image size to be divisible by this value 51 | """ 52 | self.is_train = is_train 53 | self.tfm_gens = augmentations 54 | self.img_format = image_format 55 | self.ignore_label = ignore_label 56 | self.size_divisibility = size_divisibility 57 | 58 | logger = logging.getLogger(__name__) 59 | mode = "training" if is_train else "inference" 60 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 61 | 62 | @classmethod 63 | def from_config(cls, cfg, is_train=True): 64 | # Build augmentation 65 | augs = [ 66 | T.ResizeShortestEdge( 67 | cfg.INPUT.MIN_SIZE_TRAIN, 68 | cfg.INPUT.MAX_SIZE_TRAIN, 69 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 70 | ) 71 | ] 72 | if cfg.INPUT.CROP.ENABLED: 73 | augs.append( 74 | T.RandomCrop_CategoryAreaConstraint( 75 | cfg.INPUT.CROP.TYPE, 76 | cfg.INPUT.CROP.SIZE, 77 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 78 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 79 | ) 80 | ) 81 | if cfg.INPUT.COLOR_AUG_SSD: 82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 83 | augs.append(T.RandomFlip()) 84 | 85 | # Assume always applies to the training set. 86 | dataset_names = cfg.DATASETS.TRAIN 87 | meta = MetadataCatalog.get(dataset_names[0]) 88 | ignore_label = meta.ignore_label 89 | 90 | ret = { 91 | "is_train": is_train, 92 | "augmentations": augs, 93 | "image_format": cfg.INPUT.FORMAT, 94 | "ignore_label": ignore_label, 95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 96 | } 97 | return ret 98 | 99 | def __call__(self, dataset_dict): 100 | """ 101 | Args: 102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 103 | 104 | Returns: 105 | dict: a format that builtin models in detectron2 accept 106 | """ 107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 108 | 109 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 110 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 111 | utils.check_image_size(dataset_dict, image) 112 | 113 | if "sem_seg_file_name" in dataset_dict: 114 | # PyTorch transformation not implemented for uint16, so converting it to double first 115 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 116 | else: 117 | sem_seg_gt = None 118 | 119 | if sem_seg_gt is None: 120 | raise ValueError( 121 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 122 | dataset_dict["file_name"] 123 | ) 124 | ) 125 | 126 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 127 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 128 | image = aug_input.image 129 | sem_seg_gt = aug_input.sem_seg 130 | 131 | # Pad image and segmentation label here! 
132 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 133 | if sem_seg_gt is not None: 134 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 135 | 136 | if self.size_divisibility > 0: 137 | image_size = (image.shape[-2], image.shape[-1]) 138 | padding_size = [ 139 | 0, 140 | self.size_divisibility - image_size[1], 141 | 0, 142 | self.size_divisibility - image_size[0], 143 | ] 144 | image = F.pad(image, padding_size, value=128).contiguous() 145 | if sem_seg_gt is not None: 146 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 147 | 148 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 149 | 150 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 151 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 152 | # Therefore it's important to use torch.Tensor. 153 | dataset_dict["image"] = image 154 | 155 | if sem_seg_gt is not None: 156 | dataset_dict["sem_seg"] = sem_seg_gt.long() 157 | 158 | if "annotations" in dataset_dict: 159 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 160 | 161 | # Prepare per-category binary masks 162 | if sem_seg_gt is not None: 163 | sem_seg_gt = sem_seg_gt.numpy() 164 | instances = Instances(image_shape) 165 | classes = np.unique(sem_seg_gt) 166 | # remove ignored region 167 | classes = classes[classes != self.ignore_label] 168 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 169 | 170 | masks = [] 171 | for class_id in classes: 172 | masks.append(sem_seg_gt == class_id) 173 | 174 | if len(masks) == 0: 175 | # Some image does not have annotation (all ignored) 176 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 177 | else: 178 | masks = BitMasks( 179 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 180 | ) 181 | instances.gt_masks = masks.tensor 182 | 183 | dataset_dict["instances"] = instances 184 | 185 | return dataset_dict 186 | -------------------------------------------------------------------------------- /mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import register_coco_panoptic_annos_semseg 4 | -------------------------------------------------------------------------------- /mask2former/data/datasets/register_coco_panoptic_annos_semseg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import json 4 | import os 5 | 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = { 12 | "coco_2017_train_panoptic": ( 13 | # This is the original panoptic annotation directory 14 | "coco/panoptic_train2017", 15 | "coco/annotations/panoptic_train2017.json", 16 | # This directory contains semantic annotations that are 17 | # converted from panoptic annotations. 18 | # It is used by PanopticFPN. 19 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 20 | # to create these directories. 
21 | "coco/panoptic_semseg_train2017", 22 | ), 23 | "coco_2017_val_panoptic": ( 24 | "coco/panoptic_val2017", 25 | "coco/annotations/panoptic_val2017.json", 26 | "coco/panoptic_semseg_val2017", 27 | ), 28 | } 29 | 30 | 31 | def get_metadata(): 32 | meta = {} 33 | # The following metadata maps contiguous id from [0, #thing categories + 34 | # #stuff categories) to their names and colors. We have to replica of the 35 | # same name and color under "thing_*" and "stuff_*" because the current 36 | # visualization function in D2 handles thing and class classes differently 37 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 38 | # enable reusing existing visualization functions. 39 | thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] 40 | thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] 41 | stuff_classes = [k["name"] for k in COCO_CATEGORIES] 42 | stuff_colors = [k["color"] for k in COCO_CATEGORIES] 43 | 44 | meta["thing_classes"] = thing_classes 45 | meta["thing_colors"] = thing_colors 46 | meta["stuff_classes"] = stuff_classes 47 | meta["stuff_colors"] = stuff_colors 48 | 49 | # Convert category id for training: 50 | # category id: like semantic segmentation, it is the class id for each 51 | # pixel. Since there are some classes not used in evaluation, the category 52 | # id is not always contiguous and thus we have two set of category ids: 53 | # - original category id: category id in the original dataset, mainly 54 | # used for evaluation. 55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. 
Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
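A short consumption sketch for register_coco_panoptic_annos_semseg.py above: importing the module (which mask2former.data.datasets does automatically) registers the combined panoptic-plus-semantic splits under names ending in _with_sem_seg, after which they can be fetched from detectron2's catalogs. The snippet below is illustrative only and assumes the COCO panoptic files are actually present under $DETECTRON2_DATASETS (default datasets/).

from detectron2.data import DatasetCatalog, MetadataCatalog

# Importing the package runs register_all_coco_panoptic_annos_sem_seg at module scope above.
import mask2former.data.datasets  # noqa: F401

name = "coco_2017_val_panoptic_with_sem_seg"
records = DatasetCatalog.get(name)  # list of dicts built by load_coco_panoptic_json
meta = MetadataCatalog.get(name)    # carries the metadata set above, e.g. label_divisor=1000
print(len(records), records[0]["pan_seg_file_name"], meta.ignore_label)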
2 | 3 | import itertools 4 | import json 5 | import os 6 | 7 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | # modified from COCOEvaluator for instance segmetnat 12 | class InstanceSegEvaluator(COCOEvaluator): 13 | """ 14 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 15 | for keypoint detection outputs using COCO's metrics. 16 | See http://cocodataset.org/#detection-eval and 17 | http://cocodataset.org/#keypoints-eval to understand its metrics. 18 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 19 | the metric cannot be computed (e.g. due to no predictions made). 20 | 21 | In addition to COCO, this evaluator is able to support any bounding box detection, 22 | instance segmentation, or keypoint detection dataset. 23 | """ 24 | 25 | def _eval_predictions(self, predictions, img_ids=None): 26 | """ 27 | Evaluate predictions. Fill self._results with the metrics of the tasks. 28 | """ 29 | self._logger.info("Preparing results for COCO format ...") 30 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 31 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 32 | 33 | # unmap the category ids for COCO 34 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 35 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 36 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 37 | # num_classes = len(all_contiguous_ids) 38 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 39 | 40 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 41 | for result in coco_results: 42 | category_id = result["category_id"] 43 | # assert category_id < num_classes, ( 44 | # f"A prediction has class={category_id}, " 45 | # f"but the dataset only has {num_classes} classes and " 46 | # f"predicted class id should be in [0, {num_classes - 1}]." 47 | # ) 48 | assert category_id in reverse_id_mapping, ( 49 | f"A prediction has class={category_id}, " 50 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 51 | ) 52 | result["category_id"] = reverse_id_mapping[category_id] 53 | 54 | if self._output_dir: 55 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 56 | self._logger.info("Saving results to {}".format(file_path)) 57 | with PathManager.open(file_path, "w") as f: 58 | f.write(json.dumps(coco_results)) 59 | f.flush() 60 | 61 | if not self._do_evaluation: 62 | self._logger.info("Annotations are not available for evaluation.") 63 | return 64 | 65 | self._logger.info( 66 | "Evaluating predictions with {} COCO API...".format( 67 | "unofficial" if self._use_fast_impl else "official" 68 | ) 69 | ) 70 | for task in sorted(tasks): 71 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 
72 | coco_eval = ( 73 | _evaluate_predictions_on_coco( 74 | self._coco_api, 75 | coco_results, 76 | task, 77 | kpt_oks_sigmas=self._kpt_oks_sigmas, 78 | use_fast_impl=self._use_fast_impl, 79 | img_ids=img_ids, 80 | max_dets_per_image=self._max_dets_per_image, 81 | ) 82 | if len(coco_results) > 0 83 | else None # cocoapi does not handle empty results very well 84 | ) 85 | 86 | res = self._derive_coco_results( 87 | coco_eval, task, class_names=self._metadata.get("thing_classes") 88 | ) 89 | self._results[task] = res 90 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Adapted for AutoFocusFormer by Ziwen 2023 3 | 4 | from .backbone.aff import AutoFocusFormer 5 | 6 | from .pixel_decoder.msdeformattn_pc import MSDeformAttnPixelDecoder 7 | from .meta_arch.mask_former_head import MaskFormerHead 8 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from .clusten import CLUSTENQKFunction, CLUSTENAVFunction, CLUSTENWFFunction, WEIGHTEDGATHERFunction, MSDETRPCFunction 7 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/clusten.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from torch.autograd import Function 7 | 8 | try: 9 | import clustenqk_cuda 10 | import clustenav_cuda 11 | import clustenwf_cuda 12 | import weighted_gather_cuda 13 | import msdetrpc_cuda 14 | except ImportError: 15 | raise RuntimeError("Could not load CLUSTEN CUDA extension. 
" + 16 | "Please make sure your device has CUDA, the CUDA toolkit for PyTorch is installed, and that you've compiled CLUSTEN correctly.") 17 | 18 | 19 | class CLUSTENQKFunction(Function): 20 | """ 21 | query times key function 22 | """ 23 | @staticmethod 24 | def forward(ctx, query, key, nbhd_idx): 25 | query = query.contiguous() 26 | key = key.contiguous() 27 | if key.dtype != query.dtype: 28 | key = key.to(query.dtype) 29 | nbhd_idx = nbhd_idx.contiguous() 30 | attn = clustenqk_cuda.forward( 31 | query, 32 | key.permute(0, 1, 3, 2).contiguous(), 33 | nbhd_idx) 34 | ctx.save_for_backward(query, key, nbhd_idx) 35 | return attn 36 | 37 | @staticmethod 38 | def backward(ctx, grad_attn): 39 | outputs = clustenqk_cuda.backward( 40 | grad_attn.contiguous(), *ctx.saved_tensors) 41 | d_query, d_key = outputs 42 | return d_query, d_key, None 43 | 44 | 45 | class CLUSTENAVFunction(Function): 46 | """ 47 | attention times value function 48 | """ 49 | @staticmethod 50 | def forward(ctx, attn, v, nbhd_idx): 51 | attn = attn.contiguous() 52 | v = v.contiguous() 53 | nbhd_idx = nbhd_idx.contiguous() 54 | if attn.dtype != v.dtype: 55 | v = v.to(attn.dtype) 56 | feat = clustenav_cuda.forward( 57 | attn, 58 | v, 59 | nbhd_idx) 60 | ctx.save_for_backward(attn, v, nbhd_idx) 61 | return feat 62 | 63 | @staticmethod 64 | def backward(ctx, grad_feat): 65 | outputs = clustenav_cuda.backward( 66 | grad_feat.contiguous(), *ctx.saved_tensors) 67 | d_attn, d_v = outputs 68 | return d_attn, d_v, None 69 | 70 | 71 | class CLUSTENWFFunction(Function): 72 | """ 73 | weight times feature function 74 | """ 75 | @staticmethod 76 | def forward(ctx, weights, feat, nbhd_idx): 77 | weights = weights.contiguous() 78 | feat = feat.contiguous() 79 | nbhd_idx = nbhd_idx.contiguous() 80 | if feat.dtype != weights.dtype: 81 | feat = feat.to(weights.dtype) 82 | feat_new = clustenwf_cuda.forward( 83 | weights, 84 | feat, 85 | nbhd_idx) 86 | ctx.save_for_backward(weights, feat, nbhd_idx) 87 | return feat_new 88 | 89 | @staticmethod 90 | def backward(ctx, grad_feat_new): 91 | outputs = clustenwf_cuda.backward( 92 | grad_feat_new.contiguous(), *ctx.saved_tensors) 93 | d_weights, d_feat = outputs 94 | return d_weights, d_feat, None 95 | 96 | 97 | class WEIGHTEDGATHERFunction(Function): 98 | """ 99 | weighted gather function 100 | """ 101 | @staticmethod 102 | def forward(ctx, nbhd_idx, weights, feat): 103 | nbhd_idx = nbhd_idx.contiguous() 104 | weights = weights.contiguous() 105 | feat = feat.contiguous() 106 | if feat.dtype != weights.dtype: 107 | weights = weights.to(feat.dtype) 108 | feat_new = weighted_gather_cuda.forward( 109 | nbhd_idx, 110 | weights, 111 | feat) 112 | ctx.save_for_backward(nbhd_idx, weights, feat) 113 | return feat_new 114 | 115 | @staticmethod 116 | def backward(ctx, grad_feat_new): 117 | outputs = weighted_gather_cuda.backward( 118 | grad_feat_new.contiguous(), *ctx.saved_tensors) 119 | d_weights, d_feat = outputs 120 | return None, d_weights, d_feat 121 | 122 | 123 | class MSDETRPCFunction(Function): 124 | """ 125 | deformable multi scale detr point cloud function 126 | """ 127 | @staticmethod 128 | def forward(ctx, nn_idx, nn_weight, attn, val): 129 | nn_idx = nn_idx.contiguous() 130 | nn_weight = nn_weight.contiguous() 131 | attn = attn.contiguous() 132 | val = val.contiguous() 133 | feat = msdetrpc_cuda.forward( 134 | nn_idx, 135 | nn_weight, 136 | attn, 137 | val) 138 | ctx.save_for_backward(nn_idx, nn_weight, attn, val) 139 | return feat 140 | 141 | @staticmethod 142 | def backward(ctx, grad_feat): 143 | 
        outputs = msdetrpc_cuda.backward(
144 |             grad_feat.contiguous(), *ctx.saved_tensors)
145 |         d_weight, d_attn, d_val = outputs
146 |         return None, d_weight, d_attn, d_val
147 | 
-------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenav_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * For licensing see accompanying LICENSE file.
3 |  * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 |  */
5 | 
6 | #include <torch/extension.h>
7 | #include <vector>
8 | 
9 | torch::Tensor clusten_av_cuda_forward(
10 |     const torch::Tensor &attn,      // b x h x n x m
11 |     const torch::Tensor &v,         // b x h x n x c
12 |     const torch::Tensor &nbhd_idx); // b x n x m
13 | 
14 | std::vector<torch::Tensor> clusten_av_cuda_backward(
15 |     const torch::Tensor &d_feat,
16 |     const torch::Tensor &attn,
17 |     const torch::Tensor &v,
18 |     const torch::Tensor &nbhd_idx);
19 | 
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 | 
25 | torch::Tensor clusten_av_forward(
26 |     const torch::Tensor &attn,
27 |     const torch::Tensor &v,
28 |     const torch::Tensor &nbhd_idx) {
29 |     CHECK_INPUT(attn);
30 |     CHECK_INPUT(v);
31 |     CHECK_INPUT(nbhd_idx);
32 |     return clusten_av_cuda_forward(attn, v, nbhd_idx);
33 | }
34 | 
35 | std::vector<torch::Tensor> clusten_av_backward(
36 |     const torch::Tensor &d_feat,
37 |     const torch::Tensor &attn,
38 |     const torch::Tensor &v,
39 |     const torch::Tensor &nbhd_idx) {
40 |     CHECK_INPUT(d_feat);
41 |     CHECK_INPUT(attn);
42 |     CHECK_INPUT(v);
43 |     CHECK_INPUT(nbhd_idx);
44 |     return clusten_av_cuda_backward(d_feat, attn, v, nbhd_idx);
45 | }
46 | 
47 | 
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 |     m.def("forward", &clusten_av_forward, "CLUSTENAV forward (CUDA)");
50 |     m.def("backward", &clusten_av_backward, "CLUSTENAV backward (CUDA)");
51 | }
52 | 
-------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenqk_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * For licensing see accompanying LICENSE file.
3 |  * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor clusten_qk_cuda_forward( 10 | const torch::Tensor &query, // b x h x n x c 11 | const torch::Tensor &key, // b x h x n x c 12 | const torch::Tensor &nbhd_idx); // b x n x m 13 | 14 | std::vector clusten_qk_cuda_backward( 15 | const torch::Tensor &d_attn, 16 | const torch::Tensor &query, 17 | const torch::Tensor &key, 18 | const torch::Tensor &nbhd_idx); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor clusten_qk_forward( 26 | const torch::Tensor &query, 27 | const torch::Tensor &key, 28 | const torch::Tensor &nbhd_idx) { 29 | CHECK_INPUT(query); 30 | CHECK_INPUT(key); 31 | CHECK_INPUT(nbhd_idx); 32 | return clusten_qk_cuda_forward(query, key, nbhd_idx); 33 | } 34 | 35 | std::vector clusten_qk_backward( 36 | const torch::Tensor &d_attn, 37 | const torch::Tensor &query, 38 | const torch::Tensor &key, 39 | const torch::Tensor &nbhd_idx) { 40 | CHECK_INPUT(d_attn); 41 | CHECK_INPUT(query); 42 | CHECK_INPUT(key); 43 | CHECK_INPUT(nbhd_idx); 44 | return clusten_qk_cuda_backward(d_attn, query, key, nbhd_idx); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &clusten_qk_forward, "CLUSTENQK forward (CUDA)"); 50 | m.def("backward", &clusten_qk_backward, "CLUSTENQK backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenqk_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define CUDA_NUM_THREADS 1024 17 | 18 | template 19 | __global__ void clusten_qk_cuda_forward_kernel( 20 | const torch::PackedTensorAccessor32 query, // b x h x n x c 21 | const torch::PackedTensorAccessor32 key, // b x h x c x n (reordered by cluster) 22 | const torch::PackedTensorAccessor32 nbhd_idx, // b x n x m 23 | torch::PackedTensorAccessor32 attn, // b x h x n x m 24 | const int length, // n 25 | const int batch_size, // b 26 | const int heads, // h 27 | const int nbhd_size, // m 28 | const int dim) { // c 29 | 30 | const int z = blockIdx.z * blockDim.z + threadIdx.z; 31 | if (z < batch_size * heads){ 32 | const int i = blockIdx.y * blockDim.y + threadIdx.y; 33 | if (i < length){ 34 | const int ni = blockIdx.x * blockDim.x + threadIdx.x; 35 | if (ni < nbhd_size){ 36 | const int b = z / heads; 37 | const int h = z - b * heads; 38 | int64_t nbi = nbhd_idx[b][i][ni]; 39 | // calculate q@k 40 | scalar_t updt = scalar_t(0); 41 | #pragma unroll 42 | for (unsigned int c=0; c < dim; ++c) { 43 | updt += query[b][h][i][c] * key[b][h][c][nbi]; 44 | } 45 | attn[b][h][i][ni] = updt; 46 | } 47 | } 48 | } 49 | } 50 | 51 | 52 | torch::Tensor clusten_qk_cuda_forward( 53 | const torch::Tensor &query, 54 | const torch::Tensor &key, 55 | const torch::Tensor &nbhd_idx) { 56 | 57 | int64_t batch_size = query.size(0); 58 | int64_t heads = query.size(1); 59 | int64_t length = query.size(2); 60 | int64_t dim = query.size(3); 61 | int64_t nbhd_size = nbhd_idx.size(2); 62 | int zsize = batch_size * heads; 63 | 64 | int NBHDTHREADS = min(int64_t(CUDA_NUM_THREADS), nbhd_size); 65 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / NBHDTHREADS), length); 66 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * NBHDTHREADS)); 67 | 68 | auto attn = torch::zeros( 69 | {batch_size, heads, length, nbhd_size}, query.options()); 70 | 71 | const auto stream = c10::cuda::getCurrentCUDAStream(); 72 | const dim3 blocks( 73 | (dim + NBHDTHREADS - 1) / NBHDTHREADS, 74 | (length + TOKENTHREADS - 1) / TOKENTHREADS, 75 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS); 76 | const dim3 threads(NBHDTHREADS, TOKENTHREADS, BATCHTHREADS); 77 | 78 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_forward", ([&] { 79 | const auto query_a = query.packed_accessor32(); 80 | const auto key_a = key.packed_accessor32(); 81 | const auto nbhd_idx_a = nbhd_idx.packed_accessor32(); 82 | auto attn_a = attn.packed_accessor32(); 83 | 84 | clusten_qk_cuda_forward_kernel<<>>( 85 | query_a, key_a, nbhd_idx_a, attn_a, 86 | length, batch_size, heads, nbhd_size, dim); 87 | })); 88 | return attn; 89 | } 90 | 91 | template 92 | __global__ void clusten_qk_cuda_backward_kernel( 93 | const torch::PackedTensorAccessor32 d_attn, 94 | const torch::PackedTensorAccessor32 query, 95 | const torch::PackedTensorAccessor32 key, 96 | const torch::PackedTensorAccessor32 nbhd_idx, 97 | torch::PackedTensorAccessor32 d_query, 98 | torch::PackedTensorAccessor32 d_key, 99 | const int length, 100 | const int batch_size, 101 | const int heads, 102 | const int nbhd_size, 103 | const int dim, 104 | const size_t d_key_numel) { 105 | 106 | const int z = blockIdx.z * blockDim.z + threadIdx.z; 107 | if (z < batch_size * heads){ 108 | const int i = blockIdx.y * blockDim.y + threadIdx.y; 109 | if (i < length){ 110 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 111 | if (c < dim){ 112 | const int b = z / heads; 113 
| const int h = z - b * heads; 114 | size_t index; 115 | scalar_t dq_update = scalar_t(0); 116 | scalar_t d_attn_tmp; 117 | #pragma unroll 118 | for (unsigned int ni=0; ni < nbhd_size; ++ni) { 119 | const int64_t nbi = nbhd_idx[b][i][ni]; 120 | // calculate d_query = key * d_att 121 | // calculate d_key = query * d_att 122 | d_attn_tmp = d_attn[b][h][i][ni]; 123 | dq_update += key[b][h][nbi][c] * d_attn_tmp; 124 | index = b*d_key.stride(0) + h*d_key.stride(1) + nbi*d_key.stride(2) + c; 125 | at::native::fastAtomicAdd(d_key.data(), index, d_key_numel, query[b][h][i][c] * d_attn_tmp, true); 126 | //atomicAdd(&(d_key[b][h][nbi][c]), query[b][h][i][c] * d_attn_tmp); // avoid race condition 127 | } 128 | d_query[b][h][i][c] = dq_update; 129 | } 130 | } 131 | } 132 | } 133 | 134 | std::vector clusten_qk_cuda_backward( 135 | const torch::Tensor &d_attn, 136 | const torch::Tensor &query, 137 | const torch::Tensor &key, 138 | const torch::Tensor &nbhd_idx) { 139 | 140 | int64_t batch_size = query.size(0); 141 | int64_t heads = query.size(1); 142 | int64_t length = query.size(2); 143 | int64_t dim = query.size(3); 144 | int64_t nbhd_size = nbhd_idx.size(2); 145 | int zsize = batch_size * heads; 146 | 147 | int CHANNELTHREADS = min(int64_t(CUDA_NUM_THREADS), dim); 148 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / CHANNELTHREADS), length); 149 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * CHANNELTHREADS)); 150 | 151 | auto d_query = torch::zeros_like(query); 152 | auto d_key = torch::zeros_like(key); 153 | 154 | const auto stream = c10::cuda::getCurrentCUDAStream(); 155 | 156 | const dim3 blocks( 157 | (dim + CHANNELTHREADS - 1) / CHANNELTHREADS, 158 | (length + TOKENTHREADS - 1) / TOKENTHREADS, 159 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS); 160 | 161 | const dim3 threads(CHANNELTHREADS, TOKENTHREADS, BATCHTHREADS); 162 | 163 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_backward", ([&] { 164 | const auto d_attn_a = d_attn.packed_accessor32(); 165 | const auto query_a = query.packed_accessor32(); 166 | const auto key_a = key.packed_accessor32(); 167 | const auto nbhd_idx_a = nbhd_idx.packed_accessor32(); 168 | auto d_query_a = d_query.packed_accessor32(); 169 | auto d_key_a = d_key.packed_accessor32(); 170 | 171 | const size_t d_key_numel = d_key.numel(); 172 | clusten_qk_cuda_backward_kernel<<>>( 173 | d_attn_a, query_a, key_a, nbhd_idx_a, d_query_a, d_key_a, 174 | length, batch_size, heads, nbhd_size, dim, d_key_numel); 175 | })); 176 | 177 | return {d_query, d_key}; 178 | } 179 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenwf_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor clusten_wf_cuda_forward( 10 | const torch::Tensor &weights, // b x n_ x m x ic 11 | const torch::Tensor &feat, // b x n x c 12 | const torch::Tensor &nbhd_idx); // b x n_ x m 13 | 14 | std::vector clusten_wf_cuda_backward( 15 | const torch::Tensor &d_feat_new, 16 | const torch::Tensor &weights, 17 | const torch::Tensor &feat, 18 | const torch::Tensor &nbhd_idx); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor clusten_wf_forward( 26 | const torch::Tensor &weights, 27 | const torch::Tensor &feat, 28 | const torch::Tensor &nbhd_idx) { 29 | CHECK_INPUT(weights); 30 | CHECK_INPUT(feat); 31 | CHECK_INPUT(nbhd_idx); 32 | return clusten_wf_cuda_forward(weights, feat, nbhd_idx); 33 | } 34 | 35 | std::vector clusten_wf_backward( 36 | const torch::Tensor &d_feat_new, 37 | const torch::Tensor &weights, 38 | const torch::Tensor &feat, 39 | const torch::Tensor &nbhd_idx) { 40 | CHECK_INPUT(d_feat_new); 41 | CHECK_INPUT(weights); 42 | CHECK_INPUT(feat); 43 | CHECK_INPUT(nbhd_idx); 44 | return clusten_wf_cuda_backward(d_feat_new, weights, feat, nbhd_idx); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &clusten_wf_forward, "CLUSTENWF forward (CUDA)"); 50 | m.def("backward", &clusten_wf_backward, "CLUSTENWF backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/msdetrpc_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor msdetrpc_cuda_forward( 10 | const torch::Tensor &nn_idx, // b x n x m x k 11 | const torch::Tensor &nn_weight, // b x n x m x k 12 | const torch::Tensor &attn, // b x n x m 13 | const torch::Tensor &val); // b x n_ x c 14 | 15 | std::vector msdetrpc_cuda_backward( 16 | const torch::Tensor &d_feat, 17 | const torch::Tensor &nn_idx, 18 | const torch::Tensor &nn_weight, 19 | const torch::Tensor &attn, 20 | const torch::Tensor &val); 21 | 22 | // C++ interface 23 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 24 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 25 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 26 | 27 | torch::Tensor msdetrpc_forward( 28 | const torch::Tensor &nn_idx, 29 | const torch::Tensor &nn_weight, 30 | const torch::Tensor &attn, 31 | const torch::Tensor &val) { 32 | CHECK_INPUT(nn_idx); 33 | CHECK_INPUT(nn_weight); 34 | CHECK_INPUT(attn); 35 | CHECK_INPUT(val); 36 | return msdetrpc_cuda_forward(nn_idx, nn_weight, attn, val); 37 | } 38 | 39 | std::vector msdetrpc_backward( 40 | const torch::Tensor &d_feat, 41 | const torch::Tensor &nn_idx, 42 | const torch::Tensor &nn_weight, 43 | const torch::Tensor &attn, 44 | const torch::Tensor &val) { 45 | CHECK_INPUT(d_feat); 46 | CHECK_INPUT(nn_idx); 47 | CHECK_INPUT(nn_weight); 48 | CHECK_INPUT(attn); 49 | CHECK_INPUT(val); 50 | return msdetrpc_cuda_backward(d_feat, nn_idx, nn_weight, attn, val); 51 | } 52 | 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("forward", &msdetrpc_forward, "MSDETRPC forward (CUDA)"); 56 | m.def("backward", &msdetrpc_backward, "MSDETRPC backward (CUDA)"); 57 | } 58 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 8 | 9 | setup( 10 | name='clustencuda', 11 | version='0.1', 12 | author='Ziwen Chen', 13 | author_email='chenziw@oregonstate.edu', 14 | description='Cluster Attention CUDA Kernel', 15 | ext_modules=[ 16 | CUDAExtension('clustenqk_cuda', [ 17 | 'clustenqk_cuda.cpp', 18 | 'clustenqk_cuda_kernel.cu', 19 | ]), 20 | CUDAExtension('clustenav_cuda', [ 21 | 'clustenav_cuda.cpp', 22 | 'clustenav_cuda_kernel.cu', 23 | ]), 24 | CUDAExtension('clustenwf_cuda', [ 25 | 'clustenwf_cuda.cpp', 26 | 'clustenwf_cuda_kernel.cu', 27 | ]), 28 | CUDAExtension('weighted_gather_cuda', [ 29 | 'weighted_gather_cuda.cpp', 30 | 'weighted_gather_cuda_kernel.cu', 31 | ]), 32 | CUDAExtension('msdetrpc_cuda', [ 33 | 'msdetrpc_cuda.cpp', 34 | 'msdetrpc_cuda_kernel.cu', 35 | ]), 36 | ], 37 | cmdclass={ 38 | 'build_ext': BuildExtension 39 | }) 40 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/weighted_gather_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor weighted_gather_cuda_forward( 10 | const torch::Tensor &nbhd_idx, // b x n x m 11 | const torch::Tensor &weights, // b x n x m 12 | const torch::Tensor &feat); // b x n_ x c 13 | 14 | std::vector weighted_gather_cuda_backward( 15 | const torch::Tensor &d_feat_new, 16 | const torch::Tensor &nbhd_idx, 17 | const torch::Tensor &weights, 18 | const torch::Tensor &feat); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor weighted_gather_forward( 26 | const torch::Tensor &nbhd_idx, 27 | const torch::Tensor &weights, 28 | const torch::Tensor &feat) { 29 | CHECK_INPUT(nbhd_idx); 30 | CHECK_INPUT(weights); 31 | CHECK_INPUT(feat); 32 | return weighted_gather_cuda_forward(nbhd_idx, weights, feat); 33 | } 34 | 35 | std::vector weighted_gather_backward( 36 | const torch::Tensor &d_feat_new, 37 | const torch::Tensor &nbhd_idx, 38 | const torch::Tensor &weights, 39 | const torch::Tensor &feat) { 40 | CHECK_INPUT(d_feat_new); 41 | CHECK_INPUT(nbhd_idx); 42 | CHECK_INPUT(weights); 43 | CHECK_INPUT(feat); 44 | return weighted_gather_cuda_backward(d_feat_new, nbhd_idx, weights, feat); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &weighted_gather_forward, "WEIGHTEDGATHER forward (CUDA)"); 50 | m.def("backward", &weighted_gather_backward, "WEIGHTEDGATHER backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/test_msdetrpc_kernel.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import torch 7 | from clusten import MSDETRPCFunction 8 | 9 | """ 10 | Test the correctness of MSDETR (point cloud) custom kernel 11 | """ 12 | 13 | b = 100 14 | n = 50 15 | n_ = 100 16 | m = 8 17 | k = 4 18 | c = 32 19 | 20 | # dummy data 21 | nn_idx = torch.randint(n_, (b, n, m, k)).cuda() 22 | nn_weights = torch.rand(b, n, m, k).cuda() 23 | attn = torch.rand(b, n, m).cuda() 24 | val = torch.rand(b, n_, c).cuda() 25 | 26 | nn_weights.requires_grad_(True) 27 | nn_weights.retain_grad() 28 | attn.requires_grad_(True) 29 | attn.retain_grad() 30 | val.requires_grad_(True) 31 | val.retain_grad() 32 | 33 | # use the custom kernel 34 | feat = MSDETRPCFunction.apply(nn_idx, nn_weights, attn, val) 35 | feat.mean().backward() 36 | grad_weights = nn_weights.grad.clone().detach() 37 | grad_attn = attn.grad.clone().detach() 38 | grad_val = val.grad.clone().detach() 39 | nn_weights.grad.data.zero_() 40 | attn.grad.data.zero_() 41 | val.grad.data.zero_() 42 | 43 | # use the pytorch equivalent 44 | nn_val = val.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, m, k, c) 45 | feat2 = ((nn_val * nn_weights.unsqueeze(4)).sum(3) * attn.unsqueeze(3)).sum(2) # b x n x c 46 | feat2.mean().backward() 47 | grad_weights2 = nn_weights.grad.clone().detach() 48 | grad_attn2 = attn.grad.clone().detach() 49 | grad_val2 = val.grad.clone().detach() 50 | nn_weights.grad.data.zero_() 51 | attn.grad.data.zero_() 52 | val.grad.data.zero_() 53 | 54 | print('diff of forward: ', torch.linalg.norm(feat2 - feat)) 55 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights)) 56 | print('diff of grad attn: ', torch.linalg.norm(grad_attn2 - grad_attn)) 57 | print('diff of grad val: ', torch.linalg.norm(grad_val2 - grad_val)) 58 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/test_wg_kernel.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import torch 7 | from clusten import WEIGHTEDGATHERFunction 8 | 9 | """ 10 | Test the correctness of WeightedGather custom kernel 11 | """ 12 | 13 | b = 100 14 | n = 50 15 | n_ = 100 16 | k = 4 17 | c = 32 18 | 19 | # dummy data 20 | nn_idx = torch.randint(n_, (b, n, k)).cuda() 21 | nn_weights = torch.rand(b, n, k).cuda() 22 | feature = torch.rand(b, n_, c).cuda() 23 | nn_weights.requires_grad_(True) 24 | nn_weights.retain_grad() 25 | feature.requires_grad_(True) 26 | feature.retain_grad() 27 | 28 | # use the custom kernel 29 | up_features = WEIGHTEDGATHERFunction.apply(nn_idx, nn_weights, feature) 30 | up_features.mean().backward() 31 | grad_weights = nn_weights.grad.clone().detach() 32 | grad_feat = feature.grad.clone().detach() 33 | nn_weights.grad.data.zero_() 34 | feature.grad.data.zero_() 35 | 36 | # use the pytorch equivalent 37 | nn_features = feature.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, k, c) 38 | up_features2 = nn_features.mul(nn_weights.unsqueeze(3).expand(-1, -1, -1, c)).sum(dim=2) # b x n x c 39 | up_features2.mean().backward() 40 | grad_weights2 = nn_weights.grad.clone().detach() 41 | grad_feat2 = feature.grad.clone().detach() 42 | nn_weights.grad.data.zero_() 43 | feature.grad.data.zero_() 44 | 45 | print('diff of forward: ', torch.linalg.norm(up_features2 - up_features)) 46 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights)) 47 | print('diff of grad feat: ', torch.linalg.norm(grad_feat2 - grad_feat)) 48 | -------------------------------------------------------------------------------- /mask2former/modeling/matcher.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py 3 | 4 | """ 5 | Modules to compute the matching cost and solve the corresponding LSAP. 6 | """ 7 | import torch 8 | import torch.nn.functional as F 9 | from scipy.optimize import linear_sum_assignment 10 | from torch import nn 11 | from torch.cuda.amp import autocast 12 | 13 | from detectron2.projects.point_rend.point_features import point_sample 14 | 15 | 16 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): 17 | """ 18 | Compute the DICE loss, similar to generalized IOU for masks 19 | Args: 20 | inputs: A float tensor of arbitrary shape. 21 | The predictions for each example. 22 | targets: A float tensor with the same shape as inputs. Stores the binary 23 | classification label for each element in inputs 24 | (0 for the negative class and 1 for the positive class). 25 | """ 26 | inputs = inputs.sigmoid() 27 | inputs = inputs.flatten(1) 28 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) 29 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] 30 | loss = 1 - (numerator + 1) / (denominator + 1) 31 | return loss 32 | 33 | 34 | batch_dice_loss_jit = torch.jit.script( 35 | batch_dice_loss 36 | ) # type: torch.jit.ScriptModule 37 | 38 | 39 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): 40 | """ 41 | Args: 42 | inputs: A float tensor of arbitrary shape. 43 | The predictions for each example. 44 | targets: A float tensor with the same shape as inputs. Stores the binary 45 | classification label for each element in inputs 46 | (0 for the negative class and 1 for the positive class). 
47 | Returns: 48 | Loss tensor 49 | """ 50 | hw = inputs.shape[1] 51 | 52 | pos = F.binary_cross_entropy_with_logits( 53 | inputs, torch.ones_like(inputs), reduction="none" 54 | ) 55 | neg = F.binary_cross_entropy_with_logits( 56 | inputs, torch.zeros_like(inputs), reduction="none" 57 | ) 58 | 59 | loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( 60 | "nc,mc->nm", neg, (1 - targets) 61 | ) 62 | 63 | return loss / hw 64 | 65 | 66 | batch_sigmoid_ce_loss_jit = torch.jit.script( 67 | batch_sigmoid_ce_loss 68 | ) # type: torch.jit.ScriptModule 69 | 70 | 71 | class HungarianMatcher(nn.Module): 72 | """This class computes an assignment between the targets and the predictions of the network 73 | 74 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 75 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 76 | while the others are un-matched (and thus treated as non-objects). 77 | """ 78 | 79 | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): 80 | """Creates the matcher 81 | 82 | Params: 83 | cost_class: This is the relative weight of the classification error in the matching cost 84 | cost_mask: This is the relative weight of the sigmoid cross-entropy loss of the binary mask in the matching cost 85 | cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost 86 | """ 87 | super().__init__() 88 | self.cost_class = cost_class 89 | self.cost_mask = cost_mask 90 | self.cost_dice = cost_dice 91 | 92 | assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs can't be 0" 93 | 94 | self.num_points = num_points 95 | 96 | @torch.no_grad() 97 | def memory_efficient_forward(self, outputs, targets): 98 | """More memory-friendly matching""" 99 | bs, num_queries = outputs["pred_logits"].shape[:2] 100 | 101 | indices = [] 102 | 103 | # Iterate through batch size 104 | for b in range(bs): 105 | 106 | out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] 107 | tgt_ids = targets[b]["labels"] 108 | 109 | # Compute the classification cost. Contrary to the loss, we don't use the NLL, 110 | # but approximate it by 1 - proba[target class]. 111 | # The 1 is a constant that doesn't change the matching, so it can be omitted. 112 | cost_class = -out_prob[:, tgt_ids] 113 | 114 | out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] 115 | # gt masks are already padded when preparing target 116 | tgt_mask = targets[b]["masks"].to(out_mask) 117 | 118 | out_mask = out_mask[:, None] 119 | tgt_mask = tgt_mask[:, None] 120 | # all masks share the same set of points for efficient matching!
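# (Rather than comparing full-resolution masks, `num_points` random coordinates in
# [0, 1] x [0, 1] are drawn once per image and both the predicted and ground-truth masks
# are evaluated only at those points, so each entry of the cost matrix is computed from
# num_points samples instead of H x W pixels.)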
121 | point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) 122 | # get gt labels 123 | tgt_mask = point_sample( 124 | tgt_mask, 125 | point_coords.repeat(tgt_mask.shape[0], 1, 1), 126 | align_corners=False, 127 | ).squeeze(1) 128 | 129 | out_mask = point_sample( 130 | out_mask, 131 | point_coords.repeat(out_mask.shape[0], 1, 1), 132 | align_corners=False, 133 | ).squeeze(1) 134 | 135 | with autocast(enabled=False): 136 | out_mask = out_mask.float() 137 | tgt_mask = tgt_mask.float() 138 | # Compute the sigmoid cross-entropy loss between masks 139 | # cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) 140 | cost_mask = batch_sigmoid_ce_loss(out_mask, tgt_mask) 141 | 142 | # Compute the dice loss between masks 143 | # cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) 144 | cost_dice = batch_dice_loss(out_mask, tgt_mask) 145 | 146 | # Final cost matrix 147 | C = ( 148 | self.cost_mask * cost_mask 149 | + self.cost_class * cost_class 150 | + self.cost_dice * cost_dice 151 | ) 152 | C = C.reshape(num_queries, -1).cpu() 153 | 154 | indices.append(linear_sum_assignment(C)) 155 | 156 | return [ 157 | (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) 158 | for i, j in indices 159 | ] 160 | 161 | @torch.no_grad() 162 | def forward(self, outputs, targets): 163 | """Performs the matching 164 | 165 | Params: 166 | outputs: This is a dict that contains at least these entries: 167 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 168 | "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks 169 | 170 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 171 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 172 | objects in the target) containing the class labels 173 | "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks 174 | 175 | Returns: 176 | A list of size batch_size, containing tuples of (index_i, index_j) where: 177 | - index_i is the indices of the selected predictions (in order) 178 | - index_j is the indices of the corresponding selected targets (in order) 179 | For each batch element, it holds: 180 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 181 | """ 182 | return self.memory_efficient_forward(outputs, targets) 183 | 184 | def __repr__(self, _repr_indent=4): 185 | head = "Matcher " + self.__class__.__name__ 186 | body = [ 187 | "cost_class: {}".format(self.cost_class), 188 | "cost_mask: {}".format(self.cost_mask), 189 | "cost_dice: {}".format(self.cost_dice), 190 | ] 191 | lines = [head] + [" " * _repr_indent + line for line in body] 192 | return "\n".join(lines) 193 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
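# This head couples the pixel decoder with the transformer decoder: the pixel decoder turns
# backbone features into mask features plus multi-scale features, and the transformer
# predictor consumes those to produce the class and mask predictions (see layers() below).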
2 | 3 | import logging 4 | from typing import Dict 5 | 6 | from torch import nn 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import ShapeSpec 10 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 11 | 12 | from ..transformer_decoder.mask2former_transformer_decoder import build_transformer_decoder 13 | from ..pixel_decoder.msdeformattn_pc import build_pixel_decoder 14 | 15 | 16 | @SEM_SEG_HEADS_REGISTRY.register() 17 | class MaskFormerHead(nn.Module): 18 | 19 | _version = 2 20 | 21 | def _load_from_state_dict( 22 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 23 | ): 24 | version = local_metadata.get("version", None) 25 | if version is None or version < 2: 26 | # Do not warn if training from scratch 27 | scratch = True 28 | logger = logging.getLogger(__name__) 29 | for k in list(state_dict.keys()): 30 | newk = k 31 | if newk != k: 32 | state_dict[newk] = state_dict[k] 33 | del state_dict[k] 34 | scratch = False 35 | 36 | if not scratch: 37 | logger.warning( 38 | f"Weight format of {self.__class__.__name__} has changed! " 39 | "Please upgrade your models. Applying automatic conversion now ..." 40 | ) 41 | 42 | 43 | @configurable 44 | def __init__( 45 | self, 46 | input_shape: Dict[str, ShapeSpec], 47 | *, 48 | num_classes: int, 49 | pixel_decoder: nn.Module, 50 | loss_weight: float = 1.0, 51 | ignore_value: int = -1, 52 | # extra parameters 53 | transformer_predictor: nn.Module, 54 | transformer_in_feature: str, 55 | ): 56 | """ 57 | NOTE: this interface is experimental. 58 | Args: 59 | input_shape: shapes (channels and stride) of the input features 60 | num_classes: number of classes to predict 61 | pixel_decoder: the pixel decoder module 62 | loss_weight: loss weight 63 | ignore_value: category id to be ignored during training.
64 | transformer_predictor: the transformer decoder that makes prediction 65 | transformer_in_feature: input feature name to the transformer_predictor 66 | """ 67 | super().__init__() 68 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 69 | self.in_features = [k for k, v in input_shape] 70 | feature_strides = [v.stride for k, v in input_shape] 71 | feature_channels = [v.channels for k, v in input_shape] 72 | 73 | self.ignore_value = ignore_value 74 | self.common_stride = 4 75 | self.loss_weight = loss_weight 76 | 77 | self.pixel_decoder = pixel_decoder 78 | self.predictor = transformer_predictor 79 | self.transformer_in_feature = transformer_in_feature 80 | 81 | self.num_classes = num_classes 82 | 83 | @classmethod 84 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 85 | # figure out in_channels to transformer predictor 86 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 87 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 88 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 89 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 90 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | else: 93 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 94 | 95 | return { 96 | "input_shape": { 97 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 98 | }, 99 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 100 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 101 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 102 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 103 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 104 | "transformer_predictor": build_transformer_decoder( 105 | cfg, 106 | transformer_predictor_in_channels, 107 | mask_classification=True, 108 | ), 109 | } 110 | 111 | def forward(self, features, mask=None): 112 | return self.layers(features, mask) 113 | 114 | def layers(self, features, mask=None): 115 | mask_features, mf_pos, transformer_encoder_features, multi_scale_features, multi_scale_poss = self.pixel_decoder.forward_features(features) 116 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 117 | predictions = self.predictor(multi_scale_features, multi_scale_poss, mask_features, mf_pos, mask) 118 | else: 119 | if self.transformer_in_feature == "transformer_encoder": 120 | assert ( 121 | transformer_encoder_features is not None 122 | ), "Please use the TransformerEncoderPixelDecoder." 123 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 124 | elif self.transformer_in_feature == "pixel_embedding": 125 | predictions = self.predictor(mask_features, mask_features, mask) 126 | else: 127 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 128 | return predictions 129 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | """ 6 | Various positional encodings for the transformer. 7 | """ 8 | import math 9 | 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class PositionEmbeddingSine(nn.Module): 15 | """ 16 | This is a more standard version of the position embedding, very similar to the one 17 | used by the Attention is all you need paper, generalized to work on images. 18 | """ 19 | 20 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 21 | super().__init__() 22 | self.num_pos_feats = num_pos_feats 23 | self.temperature = temperature 24 | self.normalize = normalize 25 | if scale is not None and normalize is False: 26 | raise ValueError("normalize should be True if scale is passed") 27 | if scale is None: 28 | scale = 2 * math.pi 29 | self.scale = scale 30 | 31 | def forward(self, pos): 32 | ''' 33 | pos - b x n x d 34 | ''' 35 | b, n, d = pos.shape 36 | y_embed = pos[:, :, 1] # b x n 37 | x_embed = pos[:, :, 0] 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed.max() + eps) * self.scale 41 | x_embed = x_embed / (x_embed.max() + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=pos.device) # npf 44 | dim_t = self.temperature ** (2 * (dim_t.div(2, rounding_mode='floor')) / self.num_pos_feats) # npf 45 | 46 | pos_x = x_embed[:, :, None] / dim_t # b x n x npf 47 | pos_y = y_embed[:, :, None] / dim_t 48 | pos_x = torch.cat( 49 | (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=2 50 | ) 51 | pos_y = torch.cat( 52 | (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=2 53 | ) 54 | pos = torch.cat((pos_x, pos_y), dim=2) # b x n x d' 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = [head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import copy 4 | 5 | import numpy as np 6 | import torch 7 | from fvcore.transforms import HFlipTransform 8 | from torch import nn 9 | from torch.nn.parallel import DistributedDataParallel 10 | 11 | from detectron2.data.detection_utils import read_image 12 | from detectron2.modeling import DatasetMapperTTA 13 | 14 | 15 | __all__ = [ 16 | "SemanticSegmentorWithTTA", 17 | ] 18 | 19 | 20 | class SemanticSegmentorWithTTA(nn.Module): 21 | """ 22 | A SemanticSegmentor with test-time augmentation enabled. 23 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 
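Predictions are averaged over the augmented inputs produced by ``tta_mapper``; outputs of
horizontally flipped inputs are flipped back before averaging (see :meth:`_inference_one_image`).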
24 | """ 25 | 26 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 27 | """ 28 | Args: 29 | cfg (CfgNode): 30 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 31 | tta_mapper (callable): takes a dataset dict and returns a list of 32 | augmented versions of the dataset dict. Defaults to 33 | `DatasetMapperTTA(cfg)`. 34 | batch_size (int): batch the augmented images into this batch size for inference. 35 | """ 36 | super().__init__() 37 | if isinstance(model, DistributedDataParallel): 38 | model = model.module 39 | self.cfg = cfg.clone() 40 | 41 | self.model = model 42 | 43 | if tta_mapper is None: 44 | tta_mapper = DatasetMapperTTA(cfg) 45 | self.tta_mapper = tta_mapper 46 | self.batch_size = batch_size 47 | 48 | def __call__(self, batched_inputs): 49 | """ 50 | Same input/output format as :meth:`SemanticSegmentor.forward` 51 | """ 52 | 53 | def _maybe_read_image(dataset_dict): 54 | ret = copy.copy(dataset_dict) 55 | if "image" not in ret: 56 | image = read_image(ret.pop("file_name"), self.model.input_format) 57 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 58 | ret["image"] = image 59 | if "height" not in ret and "width" not in ret: 60 | ret["height"] = image.shape[1] 61 | ret["width"] = image.shape[2] 62 | return ret 63 | 64 | processed_results = [] 65 | for x in batched_inputs: 66 | result = self._inference_one_image(_maybe_read_image(x)) 67 | processed_results.append(result) 68 | return processed_results 69 | 70 | def _inference_one_image(self, input): 71 | """ 72 | Args: 73 | input (dict): one dataset dict with "image" field being a CHW tensor 74 | Returns: 75 | dict: one output dict 76 | """ 77 | orig_shape = (input["height"], input["width"]) 78 | augmented_inputs, tfms = self._get_augmented_inputs(input) 79 | 80 | final_predictions = None 81 | count_predictions = 0 82 | for input, tfm in zip(augmented_inputs, tfms): 83 | count_predictions += 1 84 | with torch.no_grad(): 85 | if final_predictions is None: 86 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 87 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 88 | else: 89 | final_predictions = self.model([input])[0].pop("sem_seg") 90 | else: 91 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 92 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 93 | else: 94 | final_predictions += self.model([input])[0].pop("sem_seg") 95 | 96 | final_predictions = final_predictions / count_predictions 97 | return {"sem_seg": final_predictions} 98 | 99 | def _get_augmented_inputs(self, input): 100 | augmented_inputs = self.tta_mapper(input) 101 | tfms = [x.pop("transforms") for x in augmented_inputs] 102 | return augmented_inputs, tfms 103 | -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /run_aff_segmentation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # number of parallel gpus 4 | GPUS=2 5 | 6 | # path to config file 7 | CONFIG=configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml 8 | 9 | # checkpoint path for resume 10 | RESUME=checkpoints/city_pan/aff_small.pth 11 | 12 | # output folder 13 | OUTPUT=outputs/ 14 | 15 | python train_net.py --num-gpus $GPUS \ 16 | --config-file $CONFIG \ 17 | --dist-url tcp://127.0.0.1:12345 \ 18 | --resume \ 19 | --eval-only \ 20 | MODEL.WEIGHTS $RESUME \ 21 | OUTPUT_DIR $OUTPUT 22 | 23 | # Remove '--resume', '--eval-only' and 'MODEL.WEIGHTS' to start training from fresh. 24 | # Note that if '--resume' is on, the 'MODEL.WEIGHTS' option will be overwritten by the last_checkpoint file in the output folder (auto-resume), if the file exists. 25 | # The KEY VALUE pairs must be at the end, after all the flags. 
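# For example, a training-from-scratch run could look like the following (the output path is
# only a placeholder):
#   python train_net.py --num-gpus $GPUS --config-file $CONFIG --dist-url tcp://127.0.0.1:12345 OUTPUT_DIR outputs/train_from_scratch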
26 | -------------------------------------------------------------------------------- /run_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # path to config file 4 | CONFIG="../configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml" 5 | 6 | # path to pre-trained checkpoint 7 | CKPT="../checkpoints/city_pan/aff_small.pth" 8 | 9 | # path to images for prediction 10 | INPUTS="../imgs/*.jpg" 11 | 12 | # path to blurred version of input images (optional) 13 | BLUR="../imgs_blur/" 14 | 15 | # output folder to store results 16 | OUTPUT="demo_res" 17 | 18 | # create output folder 19 | mkdir $OUTPUT 20 | 21 | # run visualization code 22 | cd demo/ 23 | python demo.py --config-file $CONFIG \ 24 | --input $INPUTS \ 25 | --output ../$OUTPUT \ 26 | --blur $BLUR \ 27 | --opts MODEL.WEIGHTS $CKPT \ 28 | 29 | # The --opts flag should always be the last one 30 | # Remove --blur flag to visualize predictions on original images 31 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a few tools. 2 | 3 | * `convert-pretrained-model-to-d2.py` 4 | 5 | Tool to convert ImageNet pre-trained weights for D2. 6 | 7 | * `analyze_model.py` 8 | 9 | Tool to analyze model parameters and flops. 10 | 11 | Usage for semantic segmentation (ADE20K only, use with caution!): 12 | 13 | ``` 14 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 15 | ``` 16 | 17 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 18 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like COCO! 19 | 20 | Usage for panoptic and instance segmentation: 21 | 22 | ``` 23 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 24 | ``` 25 | 26 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 27 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from mask2former import add_maskformer2_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | """ 35 | Analyzes FLOP count, parameter count, model structure and operator activation count for models 36 | For usage example, please refer to tools/README.md 37 | """ 38 | 39 | 40 | def setup(args): 41 | if args.config_file.endswith(".yaml"): 42 | cfg = get_cfg() 43 | add_deeplab_config(cfg) 44 | add_maskformer2_config(cfg) 45 | cfg.merge_from_file(args.config_file) 46 | cfg.DATALOADER.NUM_WORKERS = 0 47 | cfg.merge_from_list(args.opts) 48 | cfg.freeze() 49 | else: 50 | cfg = LazyConfig.load(args.config_file) 51 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 52 | setup_logger(name="fvcore") 53 | setup_logger() 54 | return cfg 55 | 56 | 57 | def do_flop(cfg): 58 | if isinstance(cfg, CfgNode): 59 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 60 | model = build_model(cfg) 61 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 62 | else: 63 | data_loader = instantiate(cfg.dataloader.test) 64 | model = instantiate(cfg.model) 65 | model.to(cfg.train.device) 66 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 67 | model.eval() 68 | 69 | counts = Counter() 70 | total_flops = [] 71 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 72 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 73 | import torch 74 | crop_size = cfg.INPUT.CROP.SIZE[0] 75 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 76 | flops = FlopCountAnalysis(model, data) 77 | if idx > 0: 78 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 79 | counts += flops.by_operator() 80 | total_flops.append(flops.total()) 81 | 82 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 83 | logger.info( 84 | "Average GFlops for each type of operators:\n" 85 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 86 | ) 87 | logger.info( 88 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 89 | ) 90 | 91 | 92 | def do_activation(cfg): 93 | if isinstance(cfg, CfgNode): 94 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 95 | model = build_model(cfg) 96 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 97 | else: 98 | data_loader = instantiate(cfg.dataloader.test) 99 | model = instantiate(cfg.model) 100 | model.to(cfg.train.device) 101 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 102 | model.eval() 103 | 104 | counts = Counter() 
105 | total_activations = [] 106 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 107 | count = activation_count_operators(model, data) 108 | counts += count 109 | total_activations.append(sum(count.values())) 110 | logger.info( 111 | "(Million) Activations for Each Type of Operators:\n" 112 | + str([(k, v / idx) for k, v in counts.items()]) 113 | ) 114 | logger.info( 115 | "Total (Million) Activations: {}±{}".format( 116 | np.mean(total_activations), np.std(total_activations) 117 | ) 118 | ) 119 | 120 | 121 | def do_parameter(cfg): 122 | if isinstance(cfg, CfgNode): 123 | model = build_model(cfg) 124 | else: 125 | model = instantiate(cfg.model) 126 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 127 | 128 | 129 | def do_structure(cfg): 130 | if isinstance(cfg, CfgNode): 131 | model = build_model(cfg) 132 | else: 133 | model = instantiate(cfg.model) 134 | logger.info("Model Structure:\n" + str(model)) 135 | 136 | 137 | if __name__ == "__main__": 138 | parser = default_argument_parser( 139 | epilog=""" 140 | Examples: 141 | To show parameters of a model: 142 | $ ./analyze_model.py --tasks parameter \\ 143 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 144 | Flops and activations are data-dependent, therefore inputs and model weights 145 | are needed to count them: 146 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 147 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 148 | MODEL.WEIGHTS /path/to/model.pkl 149 | """ 150 | ) 151 | parser.add_argument( 152 | "--tasks", 153 | choices=["flop", "activation", "parameter", "structure"], 154 | required=True, 155 | nargs="+", 156 | ) 157 | parser.add_argument( 158 | "-n", 159 | "--num-inputs", 160 | default=100, 161 | type=int, 162 | help="number of inputs used to compute statistics for flops/activations, " 163 | "both are data dependent.", 164 | ) 165 | parser.add_argument( 166 | "--use-fixed-input-size", 167 | action="store_true", 168 | help="use fixed input size when calculating flops", 169 | ) 170 | args = parser.parse_args() 171 | assert not args.eval_only 172 | assert args.num_gpus == 1 173 | 174 | cfg = setup(args) 175 | 176 | for task in args.tasks: 177 | { 178 | "flop": do_flop, 179 | "activation": do_activation, 180 | "parameter": do_parameter, 181 | "structure": do_structure, 182 | }[task](cfg) 183 | -------------------------------------------------------------------------------- /tools/convert-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import pickle as pkl 6 | import sys 7 | 8 | import torch 9 | 10 | """ 11 | Usage: 12 | # run the conversion 13 | python ./convert-pretrained-model-to-d2.py aff.pth aff.pkl 14 | # Then, use aff.pkl in config: 15 | MODEL: 16 | WEIGHTS: "/path/to/aff.pkl" 17 | INPUT: 18 | FORMAT: "RGB" 19 | """ 20 | 21 | if __name__ == "__main__": 22 | input = sys.argv[1] 23 | 24 | obj = torch.load(input, map_location="cpu")["model"] 25 | 26 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 27 | 28 | with open(sys.argv[2], "wb") as f: 29 | pkl.dump(res, f) 30 | --------------------------------------------------------------------------------