├── .flake8
├── .gitattributes
├── .gitignore
├── ACKNOWLEDGMENTS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── aff.png
├── architecture.png
├── builtin.py
├── builtin_meta.py
├── configs
│   ├── ade20k
│   │   └── semantic-segmentation
│   │       ├── Base-ADE20K-SemanticSegmentation.yaml
│   │       ├── aff
│   │       │   ├── maskformer2_aff_mini_1_5th_bs32_80k.yaml
│   │       │   ├── maskformer2_aff_mini_bs32_80k.yaml
│   │       │   ├── maskformer2_aff_small_1_5th_bs32_80k.yaml
│   │       │   ├── maskformer2_aff_small_bs32_80k.yaml
│   │       │   ├── maskformer2_aff_tiny_1_5th_bs32_80k.yaml
│   │       │   └── maskformer2_aff_tiny_bs32_80k.yaml
│   │       └── maskformer2_R50_bs16_160k.yaml
│   ├── cityscapes
│   │   ├── instance-segmentation
│   │   │   ├── Base-Cityscapes-InstanceSegmentation.yaml
│   │   │   ├── aff
│   │   │   │   ├── maskformer2_aff_base_384_bs16_90k.yaml
│   │   │   │   ├── maskformer2_aff_mini_bs32_45k.yaml
│   │   │   │   ├── maskformer2_aff_small_bs32_45k.yaml
│   │   │   │   └── maskformer2_aff_tiny_bs32_45k.yaml
│   │   │   └── maskformer2_R50_bs16_90k.yaml
│   │   └── panoptic-segmentation
│   │       ├── Base-Cityscapes-PanopticSegmentation.yaml
│   │       ├── aff
│   │       │   ├── maskformer2_aff_base_384_bs16_90k.yaml
│   │       │   ├── maskformer2_aff_mini_bs32_45k.yaml
│   │       │   ├── maskformer2_aff_small_bs32_45k.yaml
│   │       │   └── maskformer2_aff_tiny_bs32_45k.yaml
│   │       └── maskformer2_R50_bs16_90k.yaml
│   └── coco
│       └── instance-segmentation
│           ├── Base-COCO-InstanceSegmentation.yaml
│           ├── aff
│           │   ├── maskformer2_aff_mini_1_5th_bs64_50ep.yaml
│           │   ├── maskformer2_aff_mini_bs64_50ep.yaml
│           │   ├── maskformer2_aff_small_1_5th_bs64_50ep.yaml
│           │   ├── maskformer2_aff_small_bs64_50ep.yaml
│           │   ├── maskformer2_aff_tiny_1_5th_bs64_50ep.yaml
│           │   └── maskformer2_aff_tiny_bs64_50ep.yaml
│           └── maskformer2_R50_bs16_50ep.yaml
├── create_env.sh
├── datasets
│   ├── README.md
│   ├── prepare_ade20k_sem_seg.py
│   ├── prepare_coco_semantic_annos_from_panoptic_annos.py
│   ├── prepare_cocofied_lvis.py
│   └── prepare_cocofied_lvisv1.py
├── demo
│   ├── demo.py
│   └── predictor.py
├── demo1.png
├── demo2.png
├── mask2former
│   ├── __init__.py
│   ├── config.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset_mappers
│   │   │   ├── __init__.py
│   │   │   ├── coco_instance_new_baseline_dataset_mapper.py
│   │   │   ├── coco_panoptic_new_baseline_dataset_mapper.py
│   │   │   ├── mask_former_instance_dataset_mapper.py
│   │   │   ├── mask_former_panoptic_dataset_mapper.py
│   │   │   └── mask_former_semantic_dataset_mapper.py
│   │   └── datasets
│   │       ├── __init__.py
│   │       └── register_coco_panoptic_annos_semseg.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── instance_evaluation.py
│   ├── maskformer_model.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── backbone
│   │   │   ├── __init__.py
│   │   │   ├── aff.py
│   │   │   └── point_utils.py
│   │   ├── clusten
│   │   │   ├── __init__.py
│   │   │   ├── clusten.py
│   │   │   ├── src
│   │   │   │   ├── clustenav_cuda.cpp
│   │   │   │   ├── clustenav_cuda_kernel.cu
│   │   │   │   ├── clustenqk_cuda.cpp
│   │   │   │   ├── clustenqk_cuda_kernel.cu
│   │   │   │   ├── clustenwf_cuda.cpp
│   │   │   │   ├── clustenwf_cuda_kernel.cu
│   │   │   │   ├── msdetrpc_cuda.cpp
│   │   │   │   ├── msdetrpc_cuda_kernel.cu
│   │   │   │   ├── setup.py
│   │   │   │   ├── weighted_gather_cuda.cpp
│   │   │   │   └── weighted_gather_cuda_kernel.cu
│   │   │   ├── test_msdetrpc_kernel.py
│   │   │   └── test_wg_kernel.py
│   │   ├── criterion.py
│   │   ├── matcher.py
│   │   ├── meta_arch
│   │   │   ├── __init__.py
│   │   │   └── mask_former_head.py
│   │   ├── pixel_decoder
│   │   │   └── msdeformattn_pc.py
│   │   └── transformer_decoder
│   │       ├── __init__.py
│   │       ├── mask2former_transformer_decoder.py
│   │       ├── position_encoding.py
│   │       └── transformer.py
│   ├── test_time_augmentation.py
│   └── utils
│       ├── __init__.py
│       └── misc.py
├── run_aff_segmentation.sh
├── run_demo.sh
├── tools
│   ├── README.md
│   ├── analyze_model.py
│   └── convert-pretrained-model-to-d2.py
└── train_net.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = B,C,E,F,P,T4,W,B9
3 | max-line-length = 120
4 | # C408 ignored because we like the dict keyword argument syntax
5 | # E501 is not flexible enough, we're using B950 instead
6 | ignore =
7 | E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E303,E226,
8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying
9 | # to line this up with executable bit
10 | EXE001,
11 | # these ignores are from flake8-bugbear; please fix!
12 | B007,B008,
13 | # these ignores are from flake8-comprehensions; please fix!
14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415,
15 | # for "unable to detect undefined names"
16 | F403,
17 | # for "Too many leading '#' for block comment (E266)"
18 | E266,
19 | # for "E731 do not assign a lambda expression, use a def"
20 | E731,
21 | # for "future feature annotations is not defined"
22 | F407,
23 | # do not use bare 'except'
24 | E722,
25 | per-file-ignores =
26 | __init__.py: F401,
27 | #pre_table is used as a global variable
28 | mask2former/modeling/pixel_decoder/msdeformattn_pc.py: F401
29 | optional-ascii-coding = True
30 | exclude =
31 | ./.git,
32 | ./docs,
33 | ./scripts,
34 | ./test
35 | ./third_party,
36 | ./venv,
37 | *.pyi
38 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.pth filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.svg
2 | .nfs*
3 | .DS_Store
4 | __pycache__/
5 | *swp*
6 | output/
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | *.py,cover
58 | .hypothesis/
59 | .pytest_cache/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/_build/
80 |
81 | # PyBuilder
82 | target/
83 |
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 |
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 |
91 | # pyenv
92 | .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102 | __pypackages__/
103 |
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, caste, color, religion, or sexual
10 | identity and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the overall
26 | community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or advances of
31 | any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email address,
35 | without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com).
63 | All complaints will be reviewed and investigated promptly and fairly.
64 |
65 | All community leaders are obligated to respect the privacy and security of the
66 | reporter of any incident.
67 |
68 | ## Enforcement Guidelines
69 |
70 | Community leaders will follow these Community Impact Guidelines in determining
71 | the consequences for any action they deem in violation of this Code of Conduct:
72 |
73 | ### 1. Correction
74 |
75 | **Community Impact**: Use of inappropriate language or other behavior deemed
76 | unprofessional or unwelcome in the community.
77 |
78 | **Consequence**: A private, written warning from community leaders, providing
79 | clarity around the nature of the violation and an explanation of why the
80 | behavior was inappropriate. A public apology may be requested.
81 |
82 | ### 2. Warning
83 |
84 | **Community Impact**: A violation through a single incident or series of
85 | actions.
86 |
87 | **Consequence**: A warning with consequences for continued behavior. No
88 | interaction with the people involved, including unsolicited interaction with
89 | those enforcing the Code of Conduct, for a specified period of time. This
90 | includes avoiding interactions in community spaces as well as external channels
91 | like social media. Violating these terms may lead to a temporary or permanent
92 | ban.
93 |
94 | ### 3. Temporary Ban
95 |
96 | **Community Impact**: A serious violation of community standards, including
97 | sustained inappropriate behavior.
98 |
99 | **Consequence**: A temporary ban from any sort of interaction or public
100 | communication with the community for a specified period of time. No public or
101 | private interaction with the people involved, including unsolicited interaction
102 | with those enforcing the Code of Conduct, is allowed during this period.
103 | Violating these terms may lead to a permanent ban.
104 |
105 | ### 4. Permanent Ban
106 |
107 | **Community Impact**: Demonstrating a pattern of violation of community
108 | standards, including sustained inappropriate behavior, harassment of an
109 | individual, or aggression toward or disparagement of classes of individuals.
110 |
111 | **Consequence**: A permanent ban from any sort of public interaction within the
112 | community.
113 |
114 | ## Attribution
115 |
116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
117 | version 2.1, available at
118 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
119 |
120 | Community Impact Guidelines were inspired by
121 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
122 |
123 | For answers to common questions about this code of conduct, see the FAQ at
124 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
125 | [https://www.contributor-covenant.org/translations][translations].
126 |
127 | [homepage]: https://www.contributor-covenant.org
128 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
129 | [Mozilla CoC]: https://github.com/mozilla/diversity
130 | [FAQ]: https://www.contributor-covenant.org/faq
131 | [translations]: https://www.contributor-covenant.org/translations
132 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contribution Guide
2 |
3 | Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
4 |
5 | While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
6 |
7 | ## Before you get started
8 |
9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
10 |
11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (C) 2023 Apple Inc. All Rights Reserved.
2 |
3 | IMPORTANT: This Apple software is supplied to you by Apple
4 | Inc. ("Apple") in consideration of your agreement to the following
5 | terms, and your use, installation, modification or redistribution of
6 | this Apple software constitutes acceptance of these terms. If you do
7 | not agree with these terms, please do not use, install, modify or
8 | redistribute this Apple software.
9 |
10 | In consideration of your agreement to abide by the following terms, and
11 | subject to these terms, Apple grants you a personal, non-exclusive
12 | license, under Apple's copyrights in this original Apple software (the
13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple
14 | Software, with or without modifications, in source and/or binary forms;
15 | provided that if you redistribute the Apple Software in its entirety and
16 | without modifications, you must retain this notice and the following
17 | text and disclaimers in all such redistributions of the Apple Software.
18 | Neither the name, trademarks, service marks or logos of Apple Inc. may
19 | be used to endorse or promote products derived from the Apple Software
20 | without specific prior written permission from Apple. Except as
21 | expressly stated in this notice, no other rights or licenses, express or
22 | implied, are granted by Apple herein, including but not limited to any
23 | patent rights that may be infringed by your derivative works or by other
24 | works in which the Apple Software may be incorporated.
25 |
26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE
27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
31 |
32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
39 | POSSIBILITY OF SUCH DAMAGE.
40 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoFocusFormer
2 |
3 | [](CODE_OF_CONDUCT.md)
4 | [](clusten/)
5 |
6 | AFF-Base: [](https://paperswithcode.com/sota/instance-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the) [](https://paperswithcode.com/sota/panoptic-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the)
7 |
8 | This software project accompanies the research paper, *AutoFocusFormer: Image Segmentation off the Grid* (CVPR 2023).
9 |
10 | [Chen Ziwen](https://www.chenziwe.com), Kaushik Patnaik, [Shuangfei Zhai](https://scholar.google.com/citations?user=G6vdBYsAAAAJ&hl=en), [Alvin Wan](http://alvinwan.com), [Zhile Ren](https://jrenzhile.com), [Alex Schwing](https://alexander-schwing.de/), [Alex Colburn](https://www.colburn.org), [Li Fuxin](https://web.engr.oregonstate.edu/~lif/)
11 |
12 | [arXiv](https://arxiv.org/abs/2304.12406) | [video narration](https://youtu.be/i1mZtk70yGY) | [AFF-Classification](https://github.com/apple/ml-autofocusformer) | [AFF-Segmentation (this repo)](https://github.com/apple/ml-autofocusformer-segmentation)
13 |
14 | ## Introduction
15 |
16 | AutoFocusFormer (AFF) is the first **adaptive**-downsampling network capable of **dense** prediction tasks such as semantic/instance segmentation.
17 |
18 | AFF abandons the traditional grid structure of image feature maps, and automatically learns to retain the most important pixels with respect to the task goal.
19 |
20 |
21 |
22 |
23 |
24 | AFF consists of a local-attention transformer backbone and a task-specific head. The backbone consists of four stages, each stage containing three modules: balanced clustering, local-attention transformer blocks, and adaptive downsampling.
25 |
26 |
27 |
28 |
29 |
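To make the data flow concrete, here is a minimal, purely illustrative PyTorch sketch of one such stage. It is not the implementation in `mask2former/modeling/backbone/aff.py`: the balanced clustering is omitted, the local-attention blocks are replaced by trivial residual MLPs, and token importance is faked with a feature norm, solely to show how each stage maps an irregular token set to a smaller one.

```python
import torch
import torch.nn as nn


class StageSketch(nn.Module):
    """Illustrative stand-in for one AFF backbone stage:
    (1) balanced clustering (omitted here), (2) local-attention blocks
    (placeholder residual MLPs), (3) adaptive downsampling (placeholder
    top-k selection by feature norm)."""

    def __init__(self, dim, depth, ds_rate=0.25):
        super().__init__()
        self.blocks = nn.ModuleList(
            [nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, dim)) for _ in range(depth)]
        )
        self.ds_rate = ds_rate

    def forward(self, feat, pos):
        # feat: (B, N, C) token features; pos: (B, N, 2) token positions
        for blk in self.blocks:                                 # refine features
            feat = feat + blk(feat)
        keep = max(1, int(feat.shape[1] * self.ds_rate))
        idx = feat.norm(dim=-1).topk(keep, dim=1).indices       # fake "importance"
        gather = idx.unsqueeze(-1)
        feat = torch.gather(feat, 1, gather.expand(-1, -1, feat.shape[-1]))
        pos = torch.gather(pos, 1, gather.expand(-1, -1, 2))
        return feat, pos                                        # smaller token set


feat, pos = torch.randn(2, 4096, 32), torch.rand(2, 4096, 2)
out_feat, out_pos = StageSketch(dim=32, depth=2, ds_rate=0.25)(feat, pos)
print(out_feat.shape, out_pos.shape)  # roughly 1/4 of the tokens survive the stage
```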
30 | AFF demonstrates significant savings in FLOPs (see our models with 1/5 downsampling rate) and significant improvements in the recognition of small objects.
31 |
32 | Notably, AFF-Small achieves **44.0** instance segmentation AP and **66.9** panoptic segmentation PQ on Cityscapes val with a backbone of only **42.6M** parameters, a performance on par with Swin-Large, a backbone with **197M** params (saving **78%**!).
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 | This repository contains the AFF backbone and the point cloud-version of the Mask2Former segmentation head.
43 |
44 | We also add a few convenient functionalities, such as visualizing prediction results on blurred versions of the images, and evaluating on cocofied LVIS v1 annotations.
45 |
46 | ## Main Results with Pretrained Models
47 |
48 | **ADE20K Semantic Segmentation (val)**
49 | | backbone | method | pretrain | crop size | mIoU | FLOPs | checkpoint |
50 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
51 | | AFF-Mini | Mask2Former | ImageNet-1K | 512x512 | 46.5 | 48.3G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini.pth) |
52 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 512x512 | 46.0 | 39.9G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini_1_5th.pth) |
53 | | AFF-Tiny | Mask2Former | ImageNet-1K | 512x512 | 50.2 | 64.6G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny.pth) |
54 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 512x512 | 50.0 | 51.1G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny_1_5th.pth) |
55 | | AFF-Small | Mask2Former | ImageNet-1K | 512x512 | 51.2 | 87G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small.pth) |
56 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 512x512 | 51.9 | 67.2G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small_1_5th.pth) |
57 |
58 | **Cityscapes Instance Segmentation (val)**
59 | | backbone | method | pretrain | AP | checkpoint |
60 | | :---: | :---: | :---: | :---: | :---: |
61 | | AFF-Mini | Mask2Former | ImageNet-1K | 40.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_mini.pth) |
62 | | AFF-Tiny | Mask2Former | ImageNet-1K | 42.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_tiny.pth) |
63 | | AFF-Small | Mask2Former | ImageNet-1K | 44.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_small.pth) |
64 | | AFF-Base | Mask2Former | ImageNet-22K | 46.2 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) |
65 |
66 | **Cityscapes Panoptic Segmentation (val)**
67 | | backbone | method | pretrain | PQ(s.s.) | checkpoint |
68 | | :---: | :---: | :---: | :---: | :---: |
69 | | AFF-Mini | Mask2Former | ImageNet-1K | 62.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_mini.pth) |
70 | | AFF-Tiny | Mask2Former | ImageNet-1K | 65.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_tiny.pth) |
71 | | AFF-Small | Mask2Former | ImageNet-1K | 66.9 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_small.pth) |
72 | | AFF-Base | Mask2Former | ImageNet-22K | 67.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) |
73 |
74 | **COCO Instance Segmentation (val)**
75 | | backbone | method | pretrain | epochs | AP | FLOPs | checkpoint |
76 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
77 | | AFF-Mini | Mask2Former | ImageNet-1K | 50 | 42.3 | 148G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini.pth) |
78 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 50 | 42.3 | 120G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini_1_5th.pth) |
79 | | AFF-Tiny | Mask2Former | ImageNet-1K | 50 | 45.3 | 204G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny.pth) |
80 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 50 | 44.5 | 152G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny_1_5th.pth) |
81 | | AFF-Small | Mask2Former | ImageNet-1K | 50 | 46.4 | 281G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small.pth) |
82 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 50 | 45.7 | 206G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small_1_5th.pth) |
83 |
84 | ## Getting Started
85 |
86 | ### Clone this repo
87 |
88 | ```bash
89 | git clone git@github.com:apple/ml-autofocusformer-segmentation.git
90 | cd ml-autofocusformer-segmentation
91 | ```
92 | One can download the pre-trained checkpoints through the links in the tables above.
93 |
94 | ### Create environment and install requirements
95 |
96 | ```bash
97 | sh create_env.sh
98 | ```
99 |
100 | See further documentation inside the script file.
101 |
102 | Our experiments are run with `CUDA==11.6` and `pytorch==1.12`.
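To confirm a freshly created environment matches these versions before launching training, a quick check (plain PyTorch introspection, not a script shipped with this repo):

```python
import torch

# Expect roughly: torch 1.12.x built against CUDA 11.6, with at least one GPU visible.
print("torch:", torch.__version__)
print("cuda (build):", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
```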
103 |
104 | ### Prepare data
105 |
106 | Please refer to [dataset README](datasets/README.md).
107 |
108 | ### Prepare pre-trained backbone checkpoint
109 |
110 | Use `tools/convert-pretrained-model-to-d2.py` to convert any torch checkpoint `.pth` file trained on ImageNet into a Detectron2 model zoo format `.pkl` file.
111 | ```bash
112 | python tools/convert-pretrained-model-to-d2.py aff_mini.pth aff_mini.pkl
113 | ```
114 | Otherwise, Detectron2 (d2) will assume the checkpoint is for the entire segmentation model, will not add the `backbone.` prefix to the parameter names, and thus will not load the checkpoint properly.
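For reference, this kind of model-zoo conversion is usually only a re-serialization step. The sketch below shows the common Detectron2 conversion pattern; it is an assumption about what the script does, not a copy of `tools/convert-pretrained-model-to-d2.py`:

```python
import pickle as pkl
import sys

import torch

# Sketch: wrap an ImageNet-pretrained .pth checkpoint in the dict layout that
# Detectron2's checkpointer treats as a backbone-only, third-party model.
# Usage: python convert_sketch.py input.pth output.pkl
if __name__ == "__main__":
    obj = torch.load(sys.argv[1], map_location="cpu")
    if isinstance(obj, dict) and "model" in obj:
        obj = obj["model"]  # unwrap training checkpoints that store weights under "model"
    res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
    with open(sys.argv[2], "wb") as f:
        pkl.dump(res, f)
```

Here `matching_heuristics` asks Detectron2's checkpointer to match parameter names heuristically when loading third-party backbone weights.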
115 |
116 | ### Train and evaluate
117 |
118 | Modify the arguments in the script `run_aff_segmentation.sh` and run
119 | ```bash
120 | sh run_aff_segmentation.sh
121 | ```
122 | for training or evaluation.
123 |
124 | One can also directly modify the config files in `configs/`.
125 |
126 | ### Visualize predictions for pre-trained models
127 |
128 | See script `run_demo.sh`. More details can be found in [Mask2Former GETTING_STARTED.md](https://github.com/facebookresearch/Mask2Former/blob/main/GETTING_STARTED.md).
129 |
130 | ### Analyze model FLOPs
131 |
132 | See [tools README](tools/README.md).
133 |
134 | ## Citing AutoFocusFormer
135 |
136 | ```BibTeX
137 | @inproceedings{autofocusformer,
138 | title = {AutoFocusFormer: Image Segmentation off the Grid},
139 | booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
140 | author = {Ziwen, Chen and Patnaik, Kaushik and Zhai, Shuangfei and Wan, Alvin and Ren, Zhile and Schwing, Alex and Colburn, Alex and Fuxin, Li},
141 | year = {2023},
142 | }
143 | ```
144 |
--------------------------------------------------------------------------------
/aff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/aff.png
--------------------------------------------------------------------------------
/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/architecture.png
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("ade20k_sem_seg_train",)
18 | TEST: ("ade20k_sem_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 160000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.05
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | AMP:
35 | ENABLED: False
36 | INPUT:
37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
38 | MIN_SIZE_TRAIN_SAMPLING: "choice"
39 | MIN_SIZE_TEST: 512
40 | MAX_SIZE_TRAIN: 2048
41 | MAX_SIZE_TEST: 2048
42 | CROP:
43 | ENABLED: True
44 | TYPE: "absolute"
45 | SIZE: (512, 512)
46 | SINGLE_CATEGORY_MAX_AREA: 1.0
47 | COLOR_AUG_SSD: True
48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper
49 | FORMAT: "RGB"
50 | DATASET_MAPPER_NAME: "mask_former_semantic"
51 | TEST:
52 | EVAL_PERIOD: 5000
53 | AUG:
54 | ENABLED: False
55 | MIN_SIZES: [256, 384, 512, 640, 768, 896]
56 | MAX_SIZE: 3584
57 | FLIP: True
58 | DATALOADER:
59 | FILTER_EMPTY_ANNOTATIONS: True
60 | NUM_WORKERS: 4
61 | VERSION: 2
62 | SEED: 0
63 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_1_5th_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.2
16 | WEIGHTS: "aff_mini_1_5th.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 80000
23 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_mini.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 80000
23 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_1_5th_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 4.0
16 | DS_RATE: 0.2
17 | WEIGHTS: "aff_small_1_5th.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 32
22 | BASE_LR: 0.0002
23 | MAX_ITER: 80000
24 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 4.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_small.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 32
22 | BASE_LR: 0.0002
23 | MAX_ITER: 80000
24 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_1_5th_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.2
16 | WEIGHTS: "aff_tiny_1_5th.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 80000
23 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_bs32_80k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_tiny.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 80000
23 |
--------------------------------------------------------------------------------
/configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IGNORE_VALUE: 255
7 | NUM_CLASSES: 150
8 | LOSS_WEIGHT: 1.0
9 | CONVS_DIM: 256
10 | MASK_DIM: 256
11 | NORM: "GN"
12 | # pixel decoder
13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16 | COMMON_STRIDE: 4
17 | TRANSFORMER_ENC_LAYERS: 6
18 | MASK_FORMER:
19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21 | DEEP_SUPERVISION: True
22 | NO_OBJECT_WEIGHT: 0.1
23 | CLASS_WEIGHT: 2.0
24 | MASK_WEIGHT: 5.0
25 | DICE_WEIGHT: 5.0
26 | HIDDEN_DIM: 256
27 | NUM_OBJECT_QUERIES: 100
28 | NHEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | ENC_LAYERS: 0
32 | PRE_NORM: False
33 | ENFORCE_INPUT_PROJ: False
34 | SIZE_DIVISIBILITY: 32
35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36 | TRAIN_NUM_POINTS: 12544
37 | OVERSAMPLE_RATIO: 3.0
38 | IMPORTANCE_SAMPLE_RATIO: 0.75
39 | TEST:
40 | SEMANTIC_ON: True
41 | INSTANCE_ON: False
42 | PANOPTIC_ON: False
43 | OVERLAP_THRESHOLD: 0.8
44 | OBJECT_MASK_THRESHOLD: 0.8
45 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | NORM: "SyncBN" # use syncbn for cityscapes dataset
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("cityscapes_fine_instance_seg_train",)
18 | TEST: ("cityscapes_fine_instance_seg_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 90000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.05
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | AMP:
35 | ENABLED: False
36 | INPUT:
37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
38 | MIN_SIZE_TRAIN_SAMPLING: "choice"
39 | MIN_SIZE_TEST: 1024
40 | MAX_SIZE_TRAIN: 4096
41 | MAX_SIZE_TEST: 2048
42 | CROP:
43 | ENABLED: True
44 | TYPE: "absolute"
45 | SIZE: (512, 1024)
46 | SINGLE_CATEGORY_MAX_AREA: 1.0
47 | COLOR_AUG_SSD: True
48 | SIZE_DIVISIBILITY: -1
49 | FORMAT: "RGB"
50 | DATASET_MAPPER_NAME: "mask_former_instance"
51 | TEST:
52 | EVAL_PERIOD: 5000
53 | AUG:
54 | ENABLED: False
55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
56 | MAX_SIZE: 4096
57 | FLIP: True
58 | DATALOADER:
59 | FILTER_EMPTY_ANNOTATIONS: True
60 | NUM_WORKERS: 4
61 | VERSION: 2
62 | SEED: 0
63 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [128, 256, 512, 1024]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [4,8,16,32]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 24
13 | NBHD_SIZE: [144,144,144,144]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 8.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_base_22kto1k_384.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | MASK_FORMER:
21 | NUM_OBJECT_QUERIES: 250
22 | SOLVER:
23 | IMS_PER_BATCH: 16
24 | BASE_LR: 0.0001
25 | MAX_ITER: 90000
26 | TEST:
27 | DETECTIONS_PER_IMAGE: 250
28 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 8.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_mini.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 45000
23 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 8.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_small.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 32
22 | BASE_LR: 0.0002
23 | MAX_ITER: 45000
24 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 8.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_tiny.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 45000
23 |
--------------------------------------------------------------------------------
/configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IGNORE_VALUE: 255
7 | NUM_CLASSES: 8
8 | LOSS_WEIGHT: 1.0
9 | CONVS_DIM: 256
10 | MASK_DIM: 256
11 | NORM: "GN"
12 | # pixel decoder
13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16 | COMMON_STRIDE: 4
17 | TRANSFORMER_ENC_LAYERS: 6
18 | MASK_FORMER:
19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21 | DEEP_SUPERVISION: True
22 | NO_OBJECT_WEIGHT: 0.1
23 | CLASS_WEIGHT: 2.0
24 | MASK_WEIGHT: 5.0
25 | DICE_WEIGHT: 5.0
26 | HIDDEN_DIM: 256
27 | NUM_OBJECT_QUERIES: 100
28 | NHEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | ENC_LAYERS: 0
32 | PRE_NORM: False
33 | ENFORCE_INPUT_PROJ: False
34 | SIZE_DIVISIBILITY: 32
35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36 | TRAIN_NUM_POINTS: 12544
37 | OVERSAMPLE_RATIO: 3.0
38 | IMPORTANCE_SAMPLE_RATIO: 0.75
39 | TEST:
40 | SEMANTIC_ON: False
41 | INSTANCE_ON: True
42 | PANOPTIC_ON: False
43 | OVERLAP_THRESHOLD: 0.8
44 | OBJECT_MASK_THRESHOLD: 0.8
45 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | NORM: "SyncBN" # use syncbn for cityscapes dataset
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | DATASETS:
17 | TRAIN: ("cityscapes_fine_panoptic_train",)
18 | TEST: ("cityscapes_fine_panoptic_val",)
19 | SOLVER:
20 | IMS_PER_BATCH: 16
21 | BASE_LR: 0.0001
22 | MAX_ITER: 90000
23 | WARMUP_FACTOR: 1.0
24 | WARMUP_ITERS: 0
25 | WEIGHT_DECAY: 0.05
26 | OPTIMIZER: "ADAMW"
27 | LR_SCHEDULER_NAME: "WarmupPolyLR"
28 | BACKBONE_MULTIPLIER: 0.1
29 | CLIP_GRADIENTS:
30 | ENABLED: True
31 | CLIP_TYPE: "full_model"
32 | CLIP_VALUE: 0.01
33 | NORM_TYPE: 2.0
34 | AMP:
35 | ENABLED: False
36 | INPUT:
37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
38 | MIN_SIZE_TRAIN_SAMPLING: "choice"
39 | MIN_SIZE_TEST: 1024
40 | MAX_SIZE_TRAIN: 4096
41 | MAX_SIZE_TEST: 2048
42 | CROP:
43 | ENABLED: True
44 | TYPE: "absolute"
45 | SIZE: (512, 1024)
46 | SINGLE_CATEGORY_MAX_AREA: 1.0
47 | COLOR_AUG_SSD: True
48 | SIZE_DIVISIBILITY: -1
49 | FORMAT: "RGB"
50 | DATASET_MAPPER_NAME: "mask_former_panoptic"
51 | TEST:
52 | EVAL_PERIOD: 5000
53 | AUG:
54 | ENABLED: False
55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
56 | MAX_SIZE: 4096
57 | FLIP: True
58 | DATALOADER:
59 | FILTER_EMPTY_ANNOTATIONS: True
60 | NUM_WORKERS: 4
61 | VERSION: 2
62 | SEED: 0
63 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [128, 256, 512, 1024]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [4,8,16,32]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 24
13 | NBHD_SIZE: [144,144,144,144]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 8.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_base_22kto1k_384.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | MASK_FORMER:
21 | NUM_OBJECT_QUERIES: 250
22 | SOLVER:
23 | IMS_PER_BATCH: 16
24 | BASE_LR: 0.0001
25 | MAX_ITER: 90000
26 | TEST:
27 | DETECTIONS_PER_IMAGE: 250
28 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 8.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_mini.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 45000
23 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 8.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_small.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 32
22 | BASE_LR: 0.0002
23 | MAX_ITER: 45000
24 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 8.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_tiny.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 32
21 | BASE_LR: 0.0002
22 | MAX_ITER: 45000
23 |
--------------------------------------------------------------------------------
/configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IGNORE_VALUE: 255
7 | NUM_CLASSES: 19
8 | LOSS_WEIGHT: 1.0
9 | CONVS_DIM: 256
10 | MASK_DIM: 256
11 | NORM: "GN"
12 | # pixel decoder
13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16 | COMMON_STRIDE: 4
17 | TRANSFORMER_ENC_LAYERS: 6
18 | MASK_FORMER:
19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21 | DEEP_SUPERVISION: True
22 | NO_OBJECT_WEIGHT: 0.1
23 | CLASS_WEIGHT: 2.0
24 | MASK_WEIGHT: 5.0
25 | DICE_WEIGHT: 5.0
26 | HIDDEN_DIM: 256
27 | NUM_OBJECT_QUERIES: 100
28 | NHEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | ENC_LAYERS: 0
32 | PRE_NORM: False
33 | ENFORCE_INPUT_PROJ: False
34 | SIZE_DIVISIBILITY: 32
35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36 | TRAIN_NUM_POINTS: 12544
37 | OVERSAMPLE_RATIO: 3.0
38 | IMPORTANCE_SAMPLE_RATIO: 0.75
39 | TEST:
40 | SEMANTIC_ON: True
41 | INSTANCE_ON: True
42 | PANOPTIC_ON: True
43 | OVERLAP_THRESHOLD: 0.8
44 | OBJECT_MASK_THRESHOLD: 0.8
45 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | BACKBONE:
3 | FREEZE_AT: 0
4 | NAME: "build_resnet_backbone"
5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6 | PIXEL_MEAN: [123.675, 116.280, 103.530]
7 | PIXEL_STD: [58.395, 57.120, 57.375]
8 | RESNETS:
9 | DEPTH: 50
10 | STEM_TYPE: "basic" # not used
11 | STEM_OUT_CHANNELS: 64
12 | STRIDE_IN_1X1: False
13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14 | # NORM: "SyncBN"
15 | RES5_MULTI_GRID: [1, 1, 1] # not used
16 | AFF:
17 | SHEPARD_POWER: 4.0
18 | SHEPARD_POWER_LEARNABLE: False
19 | DATASETS:
20 | TRAIN: ("coco_2017_train",)
21 | TEST: ("coco_2017_val",)
22 | SOLVER:
23 | IMS_PER_BATCH: 16
24 | BASE_LR: 0.0001
25 | STEPS: (327778, 355092)
26 | MAX_ITER: 368750
27 | WARMUP_FACTOR: 1.0
28 | WARMUP_ITERS: 10
29 | WEIGHT_DECAY: 0.05
30 | OPTIMIZER: "ADAMW"
31 | BACKBONE_MULTIPLIER: 0.1
32 | CLIP_GRADIENTS:
33 | ENABLED: True
34 | CLIP_TYPE: "full_model"
35 | CLIP_VALUE: 0.01
36 | NORM_TYPE: 2.0
37 | AMP:
38 | ENABLED: False
39 | INPUT:
40 | IMAGE_SIZE: 1024
41 | MIN_SCALE: 0.1
42 | MAX_SCALE: 2.0
43 | FORMAT: "RGB"
44 | DATASET_MAPPER_NAME: "coco_instance_lsj"
45 | TEST:
46 | EVAL_PERIOD: 5000
47 | DATALOADER:
48 | FILTER_EMPTY_ANNOTATIONS: True
49 | NUM_WORKERS: 4
50 | VERSION: 2
51 | SEED: 0
52 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_mini_1_5th_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.2
16 | WEIGHTS: "aff_mini_1_5th.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 64
21 | BASE_LR: 0.0002
22 | STEPS: (81945, 88773)
23 | MAX_ITER: 92188
24 | WARMUP_ITERS: 3
25 | CHECKPOINT_PERIOD: 2500
26 | TEST:
27 | EVAL_PERIOD: 2500
28 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_mini_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [32,128,256,384]
7 | DEPTHS: [2, 2, 6, 2]
8 | NUM_HEADS: [ 2, 4, 8, 16 ]
9 | DROP_PATH_RATE: 0.0
10 | PATCH_NORM: True
11 | MLP_RATIO: 2.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_mini.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 64
21 | BASE_LR: 0.0002
22 | STEPS: (81945, 88773)
23 | MAX_ITER: 92188
24 | WARMUP_ITERS: 3
25 | CHECKPOINT_PERIOD: 2500
26 | TEST:
27 | EVAL_PERIOD: 2500
28 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_small_1_5th_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 4.0
16 | DS_RATE: 0.2
17 | WEIGHTS: "aff_small_1_5th.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 64
22 | BASE_LR: 0.0002
23 | STEPS: (81945, 88773)
24 | MAX_ITER: 92188
25 | WARMUP_ITERS: 3
26 | CHECKPOINT_PERIOD: 2500
27 | TEST:
28 | EVAL_PERIOD: 2500
29 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_small_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [96,192,384,768]
7 | DEPTHS: [3,4,18,2]
8 | NUM_HEADS: [3,6,12,24]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | LAYER_SCALE: 1e-5 # turned off if 0.0
15 | ALPHA: 4.0
16 | DS_RATE: 0.25
17 | WEIGHTS: "aff_small.pkl"
18 | PIXEL_MEAN: [123.675, 116.280, 103.530]
19 | PIXEL_STD: [58.395, 57.120, 57.375]
20 | SOLVER:
21 | IMS_PER_BATCH: 64
22 | BASE_LR: 0.0002
23 | STEPS: (81945, 88773)
24 | MAX_ITER: 92188
25 | WARMUP_ITERS: 3
26 | CHECKPOINT_PERIOD: 2500
27 | TEST:
28 | EVAL_PERIOD: 2500
29 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_1_5th_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.2
16 | WEIGHTS: "aff_tiny_1_5th.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 64
21 | BASE_LR: 0.0002
22 | STEPS: (81945, 88773)
23 | MAX_ITER: 92188
24 | WARMUP_ITERS: 3
25 | CHECKPOINT_PERIOD: 2500
26 | TEST:
27 | EVAL_PERIOD: 2500
28 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_bs64_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2 | MODEL:
3 | BACKBONE:
4 | NAME: "AutoFocusFormer"
5 | AFF:
6 | EMBED_DIM: [64,128,256,512]
7 | DEPTHS: [3,4,18,5]
8 | NUM_HEADS: [2,4,8,16]
9 | DROP_PATH_RATE: 0.3
10 | PATCH_NORM: True
11 | MLP_RATIO: 3.
12 | CLUSTER_SIZE: 8
13 | NBHD_SIZE: [48,48,48,48]
14 | ALPHA: 4.0
15 | DS_RATE: 0.25
16 | WEIGHTS: "aff_tiny.pkl"
17 | PIXEL_MEAN: [123.675, 116.280, 103.530]
18 | PIXEL_STD: [58.395, 57.120, 57.375]
19 | SOLVER:
20 | IMS_PER_BATCH: 64
21 | BASE_LR: 0.0002
22 | STEPS: (81945, 88773)
23 | MAX_ITER: 92188
24 | WARMUP_ITERS: 3
25 | CHECKPOINT_PERIOD: 2500
26 | TEST:
27 | EVAL_PERIOD: 2500
28 |
--------------------------------------------------------------------------------
/configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: Base-COCO-InstanceSegmentation.yaml
2 | MODEL:
3 | META_ARCHITECTURE: "MaskFormer"
4 | SEM_SEG_HEAD:
5 | NAME: "MaskFormerHead"
6 | IGNORE_VALUE: 255
7 | NUM_CLASSES: 80
8 | LOSS_WEIGHT: 1.0
9 | CONVS_DIM: 256
10 | MASK_DIM: 256
11 | NORM: "GN"
12 | # pixel decoder
13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16 | COMMON_STRIDE: 4
17 | TRANSFORMER_ENC_LAYERS: 6
18 | MASK_FORMER:
19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21 | DEEP_SUPERVISION: True
22 | NO_OBJECT_WEIGHT: 0.1
23 | CLASS_WEIGHT: 2.0
24 | MASK_WEIGHT: 5.0
25 | DICE_WEIGHT: 5.0
26 | HIDDEN_DIM: 256
27 | NUM_OBJECT_QUERIES: 100
28 | NHEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | ENC_LAYERS: 0
32 | PRE_NORM: False
33 | ENFORCE_INPUT_PROJ: False
34 | SIZE_DIVISIBILITY: 32
35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36 | TRAIN_NUM_POINTS: 12544
37 | OVERSAMPLE_RATIO: 3.0
38 | IMPORTANCE_SAMPLE_RATIO: 0.75
39 | TEST:
40 | SEMANTIC_ON: False
41 | INSTANCE_ON: True
42 | PANOPTIC_ON: False
43 | OVERLAP_THRESHOLD: 0.8
44 | OBJECT_MASK_THRESHOLD: 0.8
45 |
--------------------------------------------------------------------------------
/create_env.sh:
--------------------------------------------------------------------------------
1 | # Create a conda virtual environment and activate it
2 | conda create -n aff python=3.8
3 | conda activate aff
4 |
5 | # Install requirements
6 | pip install \
7 | yacs==0.1.8 \
8 | termcolor==2.2.0 \
9 | timm==0.6.12 \
10 | pykeops==2.1.1 \
11 | ptflops==0.6.9 \
12 | numpy==1.22.4 \
13 | cython==0.29.33 \
14 | scipy==1.9.1 \
15 | shapely==2.0.1 \
16 | h5py==3.8.0 \
17 | submitit==1.4.5 \
18 | scikit-image==0.20.0
19 | conda install -c conda-forge opencv
20 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.6 -c pytorch -c conda-forge
21 |
22 | # Detectron2
23 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
24 |
25 | # add ADE20K_SEM_SEG_CATEGORIES_COLORS for consistent color in ADE prediction visualization
26 | mv ./builtin.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets
27 | mv ./builtin_meta.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets
28 |
29 | # Install the custom CUDA kernels for AFF
30 | cd mask2former/modeling/clusten/src && python setup.py install
31 |
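32 | # Optional sanity check (a suggested addition, not part of the original setup steps):
33 | # confirm that PyTorch sees the GPU and that detectron2 imports cleanly.
34 | python -c "import torch, detectron2; print(torch.__version__, torch.cuda.is_available())"
35 | 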
--------------------------------------------------------------------------------
/datasets/README.md:
--------------------------------------------------------------------------------
1 | # Prepare Datasets
2 |
3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
4 | for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
5 | This document explains how to set up the builtin datasets so they can be used by the above APIs.
6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
7 | and how to add new datasets to them.
8 |
9 | The datasets are assumed to exist in a directory specified by the environment variable
10 | `DETECTRON2_DATASETS`.
11 | Under this directory, detectron2 will look for datasets in the structure described below, if needed.
12 | ```
13 | $DETECTRON2_DATASETS/
14 | ADEChallengeData2016/
15 | coco/
16 | cityscapes/
17 | ```
18 |
19 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
20 | If left unset, the default is `./datasets` relative to your current working directory.
21 |
22 |
23 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download):
24 |
25 | ```
26 | coco/
27 | annotations/
28 | instances_{train,val}2017.json
29 | panoptic_{train,val}2017.json
30 | {train,val}2017/
31 | # image files that are mentioned in the corresponding json
32 | panoptic_{train,val}2017/ # png annotations
33 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below
34 | ```
35 |
36 | Install panopticapi by:
37 | ```
38 | pip install git+https://github.com/cocodataset/panopticapi.git
39 | ```
40 | Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation).
41 |
42 |
43 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/):
44 | ```
45 | cityscapes/
46 | gtFine/
47 | train/
48 | aachen/
49 | color.png, instanceIds.png, labelIds.png, polygons.json,
50 | labelTrainIds.png
51 | ...
52 | val/
53 | test/
54 | # below are generated Cityscapes panoptic annotation
55 | cityscapes_panoptic_train.json
56 | cityscapes_panoptic_train/
57 | cityscapes_panoptic_val.json
58 | cityscapes_panoptic_val/
59 | cityscapes_panoptic_test.json
60 | cityscapes_panoptic_test/
61 | leftImg8bit/
62 | train/
63 | val/
64 | test/
65 | ```
66 | Download cityscapes scripts by:
67 | ```
68 | git clone https://github.com/mcordts/cityscapesScripts.git
69 | ```
70 |
71 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapes script with:
72 | ```
73 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py
74 | ```
75 | These files are not needed for instance segmentation.
76 |
77 | Note: to generate the Cityscapes panoptic dataset, run the cityscapes script with:
78 | ```
79 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py
80 | ```
81 | These files are not needed for semantic and instance segmentation.
82 |
83 |
84 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/):
85 | ```
86 | ADEChallengeData2016/
87 | images/
88 | annotations/
89 | objectInfo150.txt
90 | # download instance annotation
91 | annotations_instance/
92 | # generated by prepare_ade20k_sem_seg.py
93 | annotations_detectron2/
94 | # below are generated by prepare_ade20k_pan_seg.py
95 | ade20k_panoptic_{train,val}.json
96 | ade20k_panoptic_{train,val}/
97 | # below are generated by prepare_ade20k_ins_seg.py
98 | ade20k_instance_{train,val}.json
99 | ```
100 |
101 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`.
102 |
103 | ## Expected dataset structure for [LVIS instance segmentation](https://www.lvisdataset.org/dataset):
104 | ```
105 | coco/
106 | {train,val,test}2017/
107 | lvis/
108 | lvis_v0.5_{train,val}.json
109 | lvis_v0.5_image_info_test.json
110 | lvis_v1_{train,val}.json
111 | lvis_v1_image_info_test{,_challenge}.json
112 | ```
113 |
114 | Install lvis-api by:
115 | ```
116 | pip install git+https://github.com/lvis-dataset/lvis-api.git
117 | ```
118 |
119 | To evaluate models trained on the COCO dataset using LVIS annotations,
120 | run `python datasets/prepare_cocofied_lvis.py` to prepare "cocofied" LVIS v0.5 annotations,
121 | or `python datasets/prepare_cocofied_lvisv1.py` to prepare "cocofied" LVIS v1 annotations.
122 |
123 | Then, add `("lvis_v0.5_val_cocofied",)` or `("lvis_v1_val_cocofied",)` to DATASETS:TEST in config files.
124 |
125 | Finally, for v1, add a `lvis_v1_cocofied` entry
126 | ```
127 | "lvis_v1_cocofied": {
128 | "lvis_v1_val_cocofied": ("coco/", "lvis/lvis_v1_val_cocofied.json"),
129 | },
130 | ```
131 | to detectron2/data/datasets/builtin.py.
132 |
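133 | ## Verifying the setup (optional)
134 | 
135 | The snippet below is a minimal sketch (not part of the preparation scripts above) for checking that a prepared dataset is visible through the `DatasetCatalog`/`MetadataCatalog` APIs mentioned at the top of this page; `coco_2017_val` is one of detectron2's builtin dataset names.
136 | 
137 | ```python
138 | from detectron2.data import DatasetCatalog, MetadataCatalog
139 | 
140 | # Loading the dataset dicts fails if the files are not laid out as described above.
141 | dicts = DatasetCatalog.get("coco_2017_val")
142 | print(len(dicts), "images registered")
143 | 
144 | # Metadata such as class names is available without loading the annotations.
145 | print(MetadataCatalog.get("coco_2017_val").thing_classes[:5])
146 | ```
147 | 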
--------------------------------------------------------------------------------
/datasets/prepare_ade20k_sem_seg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 |
5 | import os
6 | from pathlib import Path
7 |
8 | import numpy as np
9 | import tqdm
10 | from PIL import Image
11 |
12 |
13 | def convert(input, output):
14 | img = np.asarray(Image.open(input))
15 | assert img.dtype == np.uint8
16 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
17 | Image.fromarray(img).save(output)
18 |
19 |
20 | if __name__ == "__main__":
21 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
22 | for name in ["training", "validation"]:
23 | annotation_dir = dataset_dir / "annotations" / name
24 | output_dir = dataset_dir / "annotations_detectron2" / name
25 | output_dir.mkdir(parents=True, exist_ok=True)
26 | for file in tqdm.tqdm(list(annotation_dir.iterdir())):
27 | output_file = output_dir / file.name
28 | convert(file, output_file)
29 |
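30 | # Note on the conversion above (illustrative values, not part of the original script):
31 | # `img - 1` relies on uint8 wrap-around, so the ignore label 0 becomes 255 and every
32 | # other label shifts down by one, e.g.
33 | #   np.array([0, 1, 150], dtype=np.uint8) - 1  ->  array([255,   0, 149], dtype=uint8)
34 | 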
--------------------------------------------------------------------------------
/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 |
5 | import functools
6 | import json
7 | import multiprocessing as mp
8 | import numpy as np
9 | import os
10 | import time
11 | from panopticapi.utils import rgb2id
12 | from PIL import Image
13 |
14 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
15 |
16 |
17 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
18 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
19 | panoptic = rgb2id(panoptic)
20 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255
21 | for seg in segments:
22 | cat_id = seg["category_id"]
23 | new_cat_id = id_map[cat_id]
24 | output[panoptic == seg["id"]] = new_cat_id
25 | Image.fromarray(output).save(output_semantic)
26 |
27 |
28 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
29 | """
30 | Create semantic segmentation annotations from panoptic segmentation
31 |     annotations, for use in semantic segmentation evaluation.
32 |     It maps every category (things and stuff) to a contiguous id starting from 0,
33 |     and maps all unlabeled pixels to 255.
34 | Args:
35 | panoptic_json (str): path to the panoptic json file, in COCO's format.
36 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
37 | sem_seg_root (str): a directory to output semantic annotation files
38 | categories (list[dict]): category metadata. Each dict needs to have:
39 | "id": corresponds to the "category_id" in the json annotations
40 | "isthing": 0 or 1
41 | """
42 | os.makedirs(sem_seg_root, exist_ok=True)
43 |
44 | id_map = {} # map from category id to id in the output semantic annotation
45 | assert len(categories) <= 254
46 | for i, k in enumerate(categories):
47 | id_map[k["id"]] = i
48 | # what is id = 0?
49 | # id_map[0] = 255
50 | print(id_map)
51 |
52 | with open(panoptic_json) as f:
53 | obj = json.load(f)
54 |
55 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
56 |
57 | def iter_annotations():
58 | for anno in obj["annotations"]:
59 | file_name = anno["file_name"]
60 | segments = anno["segments_info"]
61 | input = os.path.join(panoptic_root, file_name)
62 | output = os.path.join(sem_seg_root, file_name)
63 | yield input, output, segments
64 |
65 | print("Start writing to {} ...".format(sem_seg_root))
66 | start = time.time()
67 | pool.starmap(
68 | functools.partial(_process_panoptic_to_semantic, id_map=id_map),
69 | iter_annotations(),
70 | chunksize=100,
71 | )
72 | print("Finished. time: {:.2f}s".format(time.time() - start))
73 |
74 |
75 | if __name__ == "__main__":
76 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
77 | for s in ["val2017", "train2017"]:
78 | separate_coco_semantic_from_panoptic(
79 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
80 | os.path.join(dataset_dir, "panoptic_{}".format(s)),
81 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)),
82 | COCO_CATEGORIES,
83 | )
84 |
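85 | # Illustration of the id mapping (hypothetical two-category subset, not part of the
86 | # original script): with categories=[{"id": 1, ...}, {"id": 3, ...}], id_map becomes
87 | # {1: 0, 3: 1}; pixels covered by no segment keep the initial value 255 assigned in
88 | # _process_panoptic_to_semantic.
89 | 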
--------------------------------------------------------------------------------
/datasets/prepare_cocofied_lvis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 |
5 | import copy
6 | import json
7 | import os
8 | from collections import defaultdict
9 |
10 | # This mapping is extracted from the official LVIS mapping:
11 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json
12 | COCO_SYNSET_CATEGORIES = [
13 | {"synset": "person.n.01", "coco_cat_id": 1},
14 | {"synset": "bicycle.n.01", "coco_cat_id": 2},
15 | {"synset": "car.n.01", "coco_cat_id": 3},
16 | {"synset": "motorcycle.n.01", "coco_cat_id": 4},
17 | {"synset": "airplane.n.01", "coco_cat_id": 5},
18 | {"synset": "bus.n.01", "coco_cat_id": 6},
19 | {"synset": "train.n.01", "coco_cat_id": 7},
20 | {"synset": "truck.n.01", "coco_cat_id": 8},
21 | {"synset": "boat.n.01", "coco_cat_id": 9},
22 | {"synset": "traffic_light.n.01", "coco_cat_id": 10},
23 | {"synset": "fireplug.n.01", "coco_cat_id": 11},
24 | {"synset": "stop_sign.n.01", "coco_cat_id": 13},
25 | {"synset": "parking_meter.n.01", "coco_cat_id": 14},
26 | {"synset": "bench.n.01", "coco_cat_id": 15},
27 | {"synset": "bird.n.01", "coco_cat_id": 16},
28 | {"synset": "cat.n.01", "coco_cat_id": 17},
29 | {"synset": "dog.n.01", "coco_cat_id": 18},
30 | {"synset": "horse.n.01", "coco_cat_id": 19},
31 | {"synset": "sheep.n.01", "coco_cat_id": 20},
32 | {"synset": "beef.n.01", "coco_cat_id": 21},
33 | {"synset": "elephant.n.01", "coco_cat_id": 22},
34 | {"synset": "bear.n.01", "coco_cat_id": 23},
35 | {"synset": "zebra.n.01", "coco_cat_id": 24},
36 | {"synset": "giraffe.n.01", "coco_cat_id": 25},
37 | {"synset": "backpack.n.01", "coco_cat_id": 27},
38 | {"synset": "umbrella.n.01", "coco_cat_id": 28},
39 | {"synset": "bag.n.04", "coco_cat_id": 31},
40 | {"synset": "necktie.n.01", "coco_cat_id": 32},
41 | {"synset": "bag.n.06", "coco_cat_id": 33},
42 | {"synset": "frisbee.n.01", "coco_cat_id": 34},
43 | {"synset": "ski.n.01", "coco_cat_id": 35},
44 | {"synset": "snowboard.n.01", "coco_cat_id": 36},
45 | {"synset": "ball.n.06", "coco_cat_id": 37},
46 | {"synset": "kite.n.03", "coco_cat_id": 38},
47 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39},
48 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40},
49 | {"synset": "skateboard.n.01", "coco_cat_id": 41},
50 | {"synset": "surfboard.n.01", "coco_cat_id": 42},
51 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43},
52 | {"synset": "bottle.n.01", "coco_cat_id": 44},
53 | {"synset": "wineglass.n.01", "coco_cat_id": 46},
54 | {"synset": "cup.n.01", "coco_cat_id": 47},
55 | {"synset": "fork.n.01", "coco_cat_id": 48},
56 | {"synset": "knife.n.01", "coco_cat_id": 49},
57 | {"synset": "spoon.n.01", "coco_cat_id": 50},
58 | {"synset": "bowl.n.03", "coco_cat_id": 51},
59 | {"synset": "banana.n.02", "coco_cat_id": 52},
60 | {"synset": "apple.n.01", "coco_cat_id": 53},
61 | {"synset": "sandwich.n.01", "coco_cat_id": 54},
62 | {"synset": "orange.n.01", "coco_cat_id": 55},
63 | {"synset": "broccoli.n.01", "coco_cat_id": 56},
64 | {"synset": "carrot.n.01", "coco_cat_id": 57},
65 | {"synset": "frank.n.02", "coco_cat_id": 58},
66 | {"synset": "pizza.n.01", "coco_cat_id": 59},
67 | {"synset": "doughnut.n.02", "coco_cat_id": 60},
68 | {"synset": "cake.n.03", "coco_cat_id": 61},
69 | {"synset": "chair.n.01", "coco_cat_id": 62},
70 | {"synset": "sofa.n.01", "coco_cat_id": 63},
71 | {"synset": "pot.n.04", "coco_cat_id": 64},
72 | {"synset": "bed.n.01", "coco_cat_id": 65},
73 | {"synset": "dining_table.n.01", "coco_cat_id": 67},
74 | {"synset": "toilet.n.02", "coco_cat_id": 70},
75 | {"synset": "television_receiver.n.01", "coco_cat_id": 72},
76 | {"synset": "laptop.n.01", "coco_cat_id": 73},
77 | {"synset": "mouse.n.04", "coco_cat_id": 74},
78 | {"synset": "remote_control.n.01", "coco_cat_id": 75},
79 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76},
80 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77},
81 | {"synset": "microwave.n.02", "coco_cat_id": 78},
82 | {"synset": "oven.n.01", "coco_cat_id": 79},
83 | {"synset": "toaster.n.02", "coco_cat_id": 80},
84 | {"synset": "sink.n.01", "coco_cat_id": 81},
85 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82},
86 | {"synset": "book.n.01", "coco_cat_id": 84},
87 | {"synset": "clock.n.01", "coco_cat_id": 85},
88 | {"synset": "vase.n.01", "coco_cat_id": 86},
89 | {"synset": "scissors.n.01", "coco_cat_id": 87},
90 | {"synset": "teddy.n.01", "coco_cat_id": 88},
91 | {"synset": "hand_blower.n.01", "coco_cat_id": 89},
92 | {"synset": "toothbrush.n.01", "coco_cat_id": 90},
93 | ]
94 |
95 |
96 | def cocofy_lvis(input_filename, output_filename):
97 | """
98 | Filter LVIS instance segmentation annotations to remove all categories that are not included in
99 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in
100 | the output json are the incontiguous COCO dataset ids.
101 |     the output json are the non-contiguous COCO dataset ids.
102 | Args:
103 | input_filename (str): path to the LVIS json file.
104 | output_filename (str): path to the COCOfied json file.
105 | """
106 |
107 | with open(input_filename, "r") as f:
108 | lvis_json = json.load(f)
109 |
110 | lvis_annos = lvis_json.pop("annotations")
111 | cocofied_lvis = copy.deepcopy(lvis_json)
112 | lvis_json["annotations"] = lvis_annos
113 |
114 | # Mapping from lvis cat id to coco cat id via synset
115 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]}
116 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES}
117 | # Synsets that we will keep in the dataset
118 | synsets_to_keep = set(synset_to_coco_cat_id.keys())
119 | coco_cat_id_with_instances = defaultdict(int)
120 |
121 | new_annos = []
122 | ann_id = 1
123 | for ann in lvis_annos:
124 | lvis_cat_id = ann["category_id"]
125 | synset = lvis_cat_id_to_synset[lvis_cat_id]
126 | if synset not in synsets_to_keep:
127 | continue
128 | coco_cat_id = synset_to_coco_cat_id[synset]
129 | new_ann = copy.deepcopy(ann)
130 | new_ann["category_id"] = coco_cat_id
131 | new_ann["id"] = ann_id
132 | ann_id += 1
133 | new_annos.append(new_ann)
134 | coco_cat_id_with_instances[coco_cat_id] += 1
135 | cocofied_lvis["annotations"] = new_annos
136 |
137 | for image in cocofied_lvis["images"]:
138 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]:
139 | new_category_list = []
140 | for lvis_cat_id in image[key]:
141 | synset = lvis_cat_id_to_synset[lvis_cat_id]
142 | if synset not in synsets_to_keep:
143 | continue
144 | coco_cat_id = synset_to_coco_cat_id[synset]
145 | new_category_list.append(coco_cat_id)
146 | coco_cat_id_with_instances[coco_cat_id] += 1
147 | image[key] = new_category_list
148 |
149 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys())
150 |
151 | new_categories = []
152 | for cat in lvis_json["categories"]:
153 | synset = cat["synset"]
154 | if synset not in synsets_to_keep:
155 | continue
156 | coco_cat_id = synset_to_coco_cat_id[synset]
157 | if coco_cat_id not in coco_cat_id_with_instances:
158 | continue
159 | new_cat = copy.deepcopy(cat)
160 | new_cat["id"] = coco_cat_id
161 | new_categories.append(new_cat)
162 | cocofied_lvis["categories"] = new_categories
163 |
164 | with open(output_filename, "w") as f:
165 | json.dump(cocofied_lvis, f)
166 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename))
167 |
168 |
169 | if __name__ == "__main__":
170 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis")
171 | for s in ["lvis_v0.5_train", "lvis_v0.5_val"]:
172 |         print("Start COCOfying {}.".format(s))
173 | cocofy_lvis(
174 | os.path.join(dataset_dir, "{}.json".format(s)),
175 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)),
176 | )
177 |
--------------------------------------------------------------------------------
/datasets/prepare_cocofied_lvisv1.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | import copy
7 | import json
8 | import os
9 | from collections import defaultdict
10 |
11 | # This mapping is extracted from the official LVIS mapping:
12 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json
13 | COCO_SYNSET_CATEGORIES = [
14 | {"synset": "person.n.01", "coco_cat_id": 1},
15 | {"synset": "bicycle.n.01", "coco_cat_id": 2},
16 | {"synset": "car.n.01", "coco_cat_id": 3},
17 | {"synset": "motorcycle.n.01", "coco_cat_id": 4},
18 | {"synset": "airplane.n.01", "coco_cat_id": 5},
19 | {"synset": "bus.n.01", "coco_cat_id": 6},
20 | {"synset": "train.n.01", "coco_cat_id": 7},
21 | {"synset": "truck.n.01", "coco_cat_id": 8},
22 | {"synset": "boat.n.01", "coco_cat_id": 9},
23 | {"synset": "traffic_light.n.01", "coco_cat_id": 10},
24 | {"synset": "fireplug.n.01", "coco_cat_id": 11},
25 | {"synset": "stop_sign.n.01", "coco_cat_id": 13},
26 | {"synset": "parking_meter.n.01", "coco_cat_id": 14},
27 | {"synset": "bench.n.01", "coco_cat_id": 15},
28 | {"synset": "bird.n.01", "coco_cat_id": 16},
29 | {"synset": "cat.n.01", "coco_cat_id": 17},
30 | {"synset": "dog.n.01", "coco_cat_id": 18},
31 | {"synset": "horse.n.01", "coco_cat_id": 19},
32 | {"synset": "sheep.n.01", "coco_cat_id": 20},
33 | {"synset": "beef.n.01", "coco_cat_id": 21},
34 | {"synset": "elephant.n.01", "coco_cat_id": 22},
35 | {"synset": "bear.n.01", "coco_cat_id": 23},
36 | {"synset": "zebra.n.01", "coco_cat_id": 24},
37 | {"synset": "giraffe.n.01", "coco_cat_id": 25},
38 | {"synset": "backpack.n.01", "coco_cat_id": 27},
39 | {"synset": "umbrella.n.01", "coco_cat_id": 28},
40 | {"synset": "bag.n.04", "coco_cat_id": 31},
41 | {"synset": "necktie.n.01", "coco_cat_id": 32},
42 | {"synset": "bag.n.06", "coco_cat_id": 33},
43 | {"synset": "frisbee.n.01", "coco_cat_id": 34},
44 | {"synset": "ski.n.01", "coco_cat_id": 35},
45 | {"synset": "snowboard.n.01", "coco_cat_id": 36},
46 | {"synset": "ball.n.06", "coco_cat_id": 37},
47 | {"synset": "kite.n.03", "coco_cat_id": 38},
48 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39},
49 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40},
50 | {"synset": "skateboard.n.01", "coco_cat_id": 41},
51 | {"synset": "surfboard.n.01", "coco_cat_id": 42},
52 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43},
53 | {"synset": "bottle.n.01", "coco_cat_id": 44},
54 | {"synset": "wineglass.n.01", "coco_cat_id": 46},
55 | {"synset": "cup.n.01", "coco_cat_id": 47},
56 | {"synset": "fork.n.01", "coco_cat_id": 48},
57 | {"synset": "knife.n.01", "coco_cat_id": 49},
58 | {"synset": "spoon.n.01", "coco_cat_id": 50},
59 | {"synset": "bowl.n.03", "coco_cat_id": 51},
60 | {"synset": "banana.n.02", "coco_cat_id": 52},
61 | {"synset": "apple.n.01", "coco_cat_id": 53},
62 | {"synset": "sandwich.n.01", "coco_cat_id": 54},
63 | {"synset": "orange.n.01", "coco_cat_id": 55},
64 | {"synset": "broccoli.n.01", "coco_cat_id": 56},
65 | {"synset": "carrot.n.01", "coco_cat_id": 57},
66 | {"synset": "frank.n.02", "coco_cat_id": 58},
67 | {"synset": "pizza.n.01", "coco_cat_id": 59},
68 | {"synset": "doughnut.n.02", "coco_cat_id": 60},
69 | {"synset": "cake.n.03", "coco_cat_id": 61},
70 | {"synset": "chair.n.01", "coco_cat_id": 62},
71 | {"synset": "sofa.n.01", "coco_cat_id": 63},
72 | {"synset": "pot.n.04", "coco_cat_id": 64},
73 | {"synset": "bed.n.01", "coco_cat_id": 65},
74 | {"synset": "dining_table.n.01", "coco_cat_id": 67},
75 | {"synset": "toilet.n.02", "coco_cat_id": 70},
76 | {"synset": "television_receiver.n.01", "coco_cat_id": 72},
77 | {"synset": "laptop.n.01", "coco_cat_id": 73},
78 | {"synset": "mouse.n.04", "coco_cat_id": 74},
79 | {"synset": "remote_control.n.01", "coco_cat_id": 75},
80 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76},
81 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77},
82 | {"synset": "microwave.n.02", "coco_cat_id": 78},
83 | {"synset": "oven.n.01", "coco_cat_id": 79},
84 | {"synset": "toaster.n.02", "coco_cat_id": 80},
85 | {"synset": "sink.n.01", "coco_cat_id": 81},
86 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82},
87 | {"synset": "book.n.01", "coco_cat_id": 84},
88 | {"synset": "clock.n.01", "coco_cat_id": 85},
89 | {"synset": "vase.n.01", "coco_cat_id": 86},
90 | {"synset": "scissors.n.01", "coco_cat_id": 87},
91 | {"synset": "teddy.n.01", "coco_cat_id": 88},
92 | {"synset": "hand_blower.n.01", "coco_cat_id": 89},
93 | {"synset": "toothbrush.n.01", "coco_cat_id": 90},
94 | ]
95 |
96 |
97 | def cocofy_lvis(input_filename, output_filename):
98 | """
99 | Filter LVIS instance segmentation annotations to remove all categories that are not included in
100 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in
101 |     the output json are the non-contiguous COCO dataset ids.
102 |
103 | Args:
104 | input_filename (str): path to the LVIS json file.
105 | output_filename (str): path to the COCOfied json file.
106 | """
107 |
108 | with open(input_filename, "r") as f:
109 | lvis_json = json.load(f)
110 |
111 | lvis_annos = lvis_json.pop("annotations")
112 | lvis_imgs = lvis_json.pop("images")
113 | cocofied_lvis = copy.deepcopy(lvis_json)
114 | lvis_json["annotations"] = lvis_annos
115 | lvis_json["images"] = lvis_imgs
116 |
117 | # Mapping from lvis cat id to coco cat id via synset
118 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]}
119 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES}
120 | # Synsets that we will keep in the dataset
121 | synsets_to_keep = set(synset_to_coco_cat_id.keys())
122 | coco_cat_id_with_instances = defaultdict(int)
123 |
124 | invalid_img_ids = set()
125 | new_img_id_dict = {}
126 |
127 | new_images = []
128 | img_id = 1
129 | for image in lvis_imgs:
130 | coco_url = image['coco_url']
131 | split, file_name = coco_url.split('/')[-2:]
132 | if split == 'train2017':
133 | invalid_img_ids.add(image['id'])
134 | continue
135 | new_img = copy.deepcopy(image)
136 | new_img_id_dict[new_img['id']] = img_id
137 | new_img['id'] = img_id
138 | img_id += 1
139 | new_img['file_name'] = file_name
140 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]:
141 | new_category_list = []
142 | for lvis_cat_id in new_img[key]:
143 | synset = lvis_cat_id_to_synset[lvis_cat_id]
144 | if synset not in synsets_to_keep:
145 | continue
146 | coco_cat_id = synset_to_coco_cat_id[synset]
147 | new_category_list.append(coco_cat_id)
148 | coco_cat_id_with_instances[coco_cat_id] += 1
149 | new_img[key] = new_category_list
150 | new_images.append(new_img)
151 | cocofied_lvis["images"] = new_images
152 |
153 | new_annos = []
154 | ann_id = 1
155 | for ann in lvis_annos:
156 | img_id = ann["image_id"]
157 | if img_id in invalid_img_ids:
158 | continue
159 | lvis_cat_id = ann["category_id"]
160 | synset = lvis_cat_id_to_synset[lvis_cat_id]
161 | if synset not in synsets_to_keep:
162 | continue
163 | coco_cat_id = synset_to_coco_cat_id[synset]
164 | new_ann = copy.deepcopy(ann)
165 | new_ann["category_id"] = coco_cat_id
166 | new_ann["id"] = ann_id
167 | ann_id += 1
168 | new_ann["image_id"] = new_img_id_dict[img_id]
169 | new_annos.append(new_ann)
170 | coco_cat_id_with_instances[coco_cat_id] += 1
171 | cocofied_lvis["annotations"] = new_annos
172 |
173 |
174 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys())
175 |
176 | new_categories = []
177 | for cat in lvis_json["categories"]:
178 | synset = cat["synset"]
179 | if synset not in synsets_to_keep:
180 | continue
181 | coco_cat_id = synset_to_coco_cat_id[synset]
182 | if coco_cat_id not in coco_cat_id_with_instances:
183 | continue
184 | new_cat = copy.deepcopy(cat)
185 | new_cat["id"] = coco_cat_id
186 | new_categories.append(new_cat)
187 | cocofied_lvis["categories"] = new_categories
188 |
189 | with open(output_filename, "w") as f:
190 | json.dump(cocofied_lvis, f)
191 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename))
192 |
193 |
194 | if __name__ == "__main__":
195 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis")
196 | for s in ["lvis_v1_val"]:
197 |         print("Start COCOfying {}.".format(s))
198 | cocofy_lvis(
199 | os.path.join(dataset_dir, "{}.json".format(s)),
200 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)),
201 | )
202 |
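203 | # Note (added commentary): unlike prepare_cocofied_lvis.py, this v1 variant also rewrites
204 | # the image entries: images whose coco_url points at train2017 are dropped along with their
205 | # annotations, file_name is reduced to the bare file name, and image/annotation ids are
206 | # re-assigned contiguously starting from 1.
207 | 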
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
3 | # Adapted for AutoFocusFormer by Ziwen 2023
4 |
5 | import argparse
6 | import glob
7 | import multiprocessing as mp
8 | import os
9 |
10 | # fmt: off
11 | import sys
12 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
13 | # fmt: on
14 |
15 | import tempfile
16 | import time
17 | import warnings
18 |
19 | import cv2
20 | import numpy as np
21 | import tqdm
22 |
23 | from detectron2.config import get_cfg
24 | from detectron2.data.detection_utils import read_image
25 | from detectron2.projects.deeplab import add_deeplab_config
26 | from detectron2.utils.logger import setup_logger
27 |
28 | from mask2former import add_maskformer2_config
29 | from predictor import VisualizationDemo
30 |
31 |
32 | # constants
33 | WINDOW_NAME = "mask2former demo"
34 |
35 |
36 | def setup_cfg(args):
37 | # load config from file and command-line arguments
38 | cfg = get_cfg()
39 | add_deeplab_config(cfg)
40 | add_maskformer2_config(cfg)
41 | cfg.merge_from_file(args.config_file)
42 | cfg.merge_from_list(args.opts)
43 | cfg.freeze()
44 | return cfg
45 |
46 |
47 | def get_parser():
48 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs")
49 | parser.add_argument(
50 | "--config-file",
51 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml",
52 | metavar="FILE",
53 | help="path to config file",
54 | )
55 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
56 | parser.add_argument("--video-input", help="Path to video file.")
57 | parser.add_argument(
58 | "--input",
59 | nargs="+",
60 | help="A list of space separated input images; "
61 | "or a single glob pattern such as 'directory/*.jpg'",
62 | )
63 | parser.add_argument(
64 | "--output",
65 | help="A file or directory to save output visualizations. "
66 | "If not given, will show output in an OpenCV window.",
67 | )
68 |
69 | parser.add_argument(
70 | "--confidence-threshold",
71 | type=float,
72 | default=0.5,
73 | help="Minimum score for instance predictions to be shown",
74 | )
75 | parser.add_argument(
76 | "--opts",
77 | help="Modify config options using the command-line 'KEY VALUE' pairs",
78 | default=[],
79 | nargs=argparse.REMAINDER,
80 | )
81 | parser.add_argument(
82 | "--blur",
83 |         help="A directory containing blurred versions of the inputs (e.g., blurred human faces). "
84 |         "If given, predictions are visualized on the blurred images. "
85 |         "Images inside this folder need to have the same name as the input images.",
86 | )
87 | return parser
88 |
89 |
90 | def test_opencv_video_format(codec, file_ext):
91 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
92 | filename = os.path.join(dir, "test_file" + file_ext)
93 | writer = cv2.VideoWriter(
94 | filename=filename,
95 | fourcc=cv2.VideoWriter_fourcc(*codec),
96 | fps=float(30),
97 | frameSize=(10, 10),
98 | isColor=True,
99 | )
100 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
101 | writer.release()
102 | if os.path.isfile(filename):
103 | return True
104 | return False
105 |
106 |
107 | if __name__ == "__main__":
108 | mp.set_start_method("spawn", force=True)
109 | args = get_parser().parse_args()
110 | setup_logger(name="fvcore")
111 | logger = setup_logger()
112 | logger.info("Arguments: " + str(args))
113 |
114 | cfg = setup_cfg(args)
115 |
116 | demo = VisualizationDemo(cfg)
117 |
118 | if args.input:
119 | if len(args.input) == 1:
120 | args.input = glob.glob(os.path.expanduser(args.input[0]))
121 | assert args.input, "The input path(s) was not found"
122 | for path in tqdm.tqdm(args.input, disable=not args.output):
123 | # use PIL, to be consistent with evaluation
124 | img = read_image(path, format="BGR")
125 | if args.blur:
126 | path_blur = os.path.join(args.blur, path.split('/')[-1])
127 | img_blur = read_image(path_blur, format="BGR")
128 | else:
129 | img_blur = None
130 | start_time = time.time()
131 | predictions, visualized_output = demo.run_on_image(img, blur=img_blur)
132 | logger.info(
133 | "{}: {} in {:.2f}s".format(
134 | path,
135 | "detected {} instances".format(len(predictions["instances"]))
136 | if "instances" in predictions
137 | else "finished",
138 | time.time() - start_time,
139 | )
140 | )
141 |
142 | if args.output:
143 | if os.path.isdir(args.output):
144 | assert os.path.isdir(args.output), args.output
145 | out_filename = os.path.join(args.output, os.path.basename(path))
146 | else:
147 | assert len(args.input) == 1, "Please specify a directory with args.output"
148 | out_filename = args.output
149 | visualized_output.save(out_filename)
150 | else:
151 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
152 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
153 | if cv2.waitKey(0) == 27:
154 | break # esc to quit
155 | elif args.webcam:
156 | assert args.input is None, "Cannot have both --input and --webcam!"
157 | assert args.output is None, "output not yet supported with --webcam!"
158 | cam = cv2.VideoCapture(0)
159 | for vis in tqdm.tqdm(demo.run_on_video(cam)):
160 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
161 | cv2.imshow(WINDOW_NAME, vis)
162 | if cv2.waitKey(1) == 27:
163 | break # esc to quit
164 | cam.release()
165 | cv2.destroyAllWindows()
166 | elif args.video_input:
167 | video = cv2.VideoCapture(args.video_input)
168 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
169 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
170 | frames_per_second = video.get(cv2.CAP_PROP_FPS)
171 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
172 | basename = os.path.basename(args.video_input)
173 | codec, file_ext = (
174 | ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
175 | )
176 |         if codec == "mp4v":
177 | warnings.warn("x264 codec not available, switching to mp4v")
178 | if args.output:
179 | if os.path.isdir(args.output):
180 | output_fname = os.path.join(args.output, basename)
181 | output_fname = os.path.splitext(output_fname)[0] + file_ext
182 | else:
183 | output_fname = args.output
184 | assert not os.path.isfile(output_fname), output_fname
185 | output_file = cv2.VideoWriter(
186 | filename=output_fname,
187 |             # some installations of opencv may not support x264 (due to its license);
188 |             # you can try another format (e.g. MPEG) instead
189 | fourcc=cv2.VideoWriter_fourcc(*codec),
190 | fps=float(frames_per_second),
191 | frameSize=(width, height),
192 | isColor=True,
193 | )
194 | assert os.path.isfile(args.video_input)
195 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
196 | if args.output:
197 | output_file.write(vis_frame)
198 | else:
199 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
200 | cv2.imshow(basename, vis_frame)
201 | if cv2.waitKey(1) == 27:
202 | break # esc to quit
203 | video.release()
204 | if args.output:
205 | output_file.release()
206 | else:
207 | cv2.destroyAllWindows()
208 |
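209 | # Example invocation (hypothetical paths; the config file and weight name come from the
210 | # configs above, but adjust them to your checkout):
211 | #   python demo/demo.py \
212 | #     --config-file configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_bs64_50ep.yaml \
213 | #     --input demo1.png --output output/ \
214 | #     --opts MODEL.WEIGHTS aff_tiny.pkl
215 | 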
--------------------------------------------------------------------------------
/demo/predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
3 | # Adapted for AutoFocusFormer by Ziwen 2023
4 |
5 | import atexit
6 | import bisect
7 | import multiprocessing as mp
8 | from collections import deque
9 |
10 | import cv2
11 | import torch
12 |
13 | from detectron2.data import MetadataCatalog
14 | from detectron2.engine.defaults import DefaultPredictor
15 | from detectron2.utils.video_visualizer import VideoVisualizer
16 | from detectron2.utils.visualizer import ColorMode, Visualizer
17 |
18 |
19 | class VisualizationDemo(object):
20 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
21 | """
22 | Args:
23 | cfg (CfgNode):
24 | instance_mode (ColorMode):
25 | parallel (bool): whether to run the model in different processes from visualization.
26 | Useful since the visualization logic can be slow.
27 | """
28 | self.metadata = MetadataCatalog.get(
29 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
30 | )
31 | self.cpu_device = torch.device("cpu")
32 | self.instance_mode = instance_mode
33 |
34 | self.parallel = parallel
35 | if parallel:
36 | num_gpu = torch.cuda.device_count()
37 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
38 | else:
39 | self.predictor = DefaultPredictor(cfg)
40 |
41 | def run_on_image(self, image, blur=None):
42 | """
43 | Args:
44 | image (np.ndarray): an image of shape (H, W, C) (in BGR order).
45 | This is the format used by OpenCV.
46 | Returns:
47 | predictions (dict): the output of the model.
48 | vis_output (VisImage): the visualized image output.
49 | """
50 | vis_output = None
51 | predictions = self.predictor(image)
52 | # Convert image from OpenCV BGR format to Matplotlib RGB format.
53 | if blur is not None:
54 | blur = blur[:, :, ::-1]
55 | visualizer = Visualizer(blur, self.metadata, instance_mode=self.instance_mode)
56 | else:
57 | image = image[:, :, ::-1]
58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
59 | if "panoptic_seg" in predictions:
60 | panoptic_seg, segments_info = predictions["panoptic_seg"]
61 | vis_output = visualizer.draw_panoptic_seg_predictions(
62 | panoptic_seg.to(self.cpu_device), segments_info
63 | )
64 | else:
65 | if "sem_seg" in predictions:
66 | vis_output = visualizer.draw_sem_seg(
67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
68 | )
69 | if "instances" in predictions:
70 | instances = predictions["instances"].to(self.cpu_device)
71 | vis_output = visualizer.draw_instance_predictions(predictions=instances)
72 |
73 | return predictions, vis_output
74 |
75 | def _frame_from_video(self, video):
76 | while video.isOpened():
77 | success, frame = video.read()
78 | if success:
79 | yield frame
80 | else:
81 | break
82 |
83 | def run_on_video(self, video):
84 | """
85 | Visualizes predictions on frames of the input video.
86 | Args:
87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
88 | either a webcam or a video file.
89 | Yields:
90 | ndarray: BGR visualizations of each video frame.
91 | """
92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
93 |
94 | def process_predictions(frame, predictions):
95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
96 | if "panoptic_seg" in predictions:
97 | panoptic_seg, segments_info = predictions["panoptic_seg"]
98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions(
99 | frame, panoptic_seg.to(self.cpu_device), segments_info
100 | )
101 | elif "instances" in predictions:
102 | predictions = predictions["instances"].to(self.cpu_device)
103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
104 | elif "sem_seg" in predictions:
105 | vis_frame = video_visualizer.draw_sem_seg(
106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
107 | )
108 |
109 | # Converts Matplotlib RGB format to OpenCV BGR format
110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
111 | return vis_frame
112 |
113 | frame_gen = self._frame_from_video(video)
114 | if self.parallel:
115 | buffer_size = self.predictor.default_buffer_size
116 |
117 | frame_data = deque()
118 |
119 | for cnt, frame in enumerate(frame_gen):
120 | frame_data.append(frame)
121 | self.predictor.put(frame)
122 |
123 | if cnt >= buffer_size:
124 | frame = frame_data.popleft()
125 | predictions = self.predictor.get()
126 | yield process_predictions(frame, predictions)
127 |
128 | while len(frame_data):
129 | frame = frame_data.popleft()
130 | predictions = self.predictor.get()
131 | yield process_predictions(frame, predictions)
132 | else:
133 | for frame in frame_gen:
134 | yield process_predictions(frame, self.predictor(frame))
135 |
136 |
137 | class AsyncPredictor:
138 | """
139 | A predictor that runs the model asynchronously, possibly on >1 GPUs.
140 |     Because rendering the visualization takes a considerable amount of time,
141 | this helps improve throughput a little bit when rendering videos.
142 | """
143 |
144 | class _StopToken:
145 | pass
146 |
147 | class _PredictWorker(mp.Process):
148 | def __init__(self, cfg, task_queue, result_queue):
149 | self.cfg = cfg
150 | self.task_queue = task_queue
151 | self.result_queue = result_queue
152 | super().__init__()
153 |
154 | def run(self):
155 | predictor = DefaultPredictor(self.cfg)
156 |
157 | while True:
158 | task = self.task_queue.get()
159 | if isinstance(task, AsyncPredictor._StopToken):
160 | break
161 | idx, data = task
162 | result = predictor(data)
163 | self.result_queue.put((idx, result))
164 |
165 | def __init__(self, cfg, num_gpus: int = 1):
166 | """
167 | Args:
168 | cfg (CfgNode):
169 | num_gpus (int): if 0, will run on CPU
170 | """
171 | num_workers = max(num_gpus, 1)
172 | self.task_queue = mp.Queue(maxsize=num_workers * 3)
173 | self.result_queue = mp.Queue(maxsize=num_workers * 3)
174 | self.procs = []
175 | for gpuid in range(max(num_gpus, 1)):
176 | cfg = cfg.clone()
177 | cfg.defrost()
178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
179 | self.procs.append(
180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
181 | )
182 |
183 | self.put_idx = 0
184 | self.get_idx = 0
185 | self.result_rank = []
186 | self.result_data = []
187 |
188 | for p in self.procs:
189 | p.start()
190 | atexit.register(self.shutdown)
191 |
192 | def put(self, image):
193 | self.put_idx += 1
194 | self.task_queue.put((self.put_idx, image))
195 |
196 | def get(self):
197 | self.get_idx += 1 # the index needed for this request
198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx:
199 | res = self.result_data[0]
200 | del self.result_data[0], self.result_rank[0]
201 | return res
202 |
203 | while True:
204 | # make sure the results are returned in the correct order
205 | idx, res = self.result_queue.get()
206 | if idx == self.get_idx:
207 | return res
208 | insert = bisect.bisect(self.result_rank, idx)
209 | self.result_rank.insert(insert, idx)
210 | self.result_data.insert(insert, res)
211 |
212 | def __len__(self):
213 | return self.put_idx - self.get_idx
214 |
215 | def __call__(self, image):
216 | self.put(image)
217 | return self.get()
218 |
219 | def shutdown(self):
220 | for _ in self.procs:
221 | self.task_queue.put(AsyncPredictor._StopToken())
222 |
223 | @property
224 | def default_buffer_size(self):
225 | return len(self.procs) * 5
226 |
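227 | # Minimal usage sketch (assumes a cfg prepared as in demo.py's setup_cfg; the image path is
228 | # hypothetical):
229 | #   from detectron2.data.detection_utils import read_image
230 | #   demo = VisualizationDemo(cfg, parallel=False)
231 | #   predictions, vis = demo.run_on_image(read_image("input.jpg", format="BGR"))
232 | #   vis.save("output.jpg")
233 | 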
--------------------------------------------------------------------------------
/demo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo1.png
--------------------------------------------------------------------------------
/demo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo2.png
--------------------------------------------------------------------------------
/mask2former/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | from . import data # register all new datasets
4 | from . import modeling
5 |
6 | # config
7 | from .config import add_maskformer2_config
8 |
9 | # dataset loading
10 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
11 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
12 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
13 | MaskFormerInstanceDatasetMapper,
14 | )
15 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
16 | MaskFormerPanopticDatasetMapper,
17 | )
18 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
19 | MaskFormerSemanticDatasetMapper,
20 | )
21 |
22 | # models
23 | from .maskformer_model import MaskFormer
24 | from .test_time_augmentation import SemanticSegmentorWithTTA
25 |
26 | # evaluation
27 | from .evaluation.instance_evaluation import InstanceSegEvaluator
28 |
--------------------------------------------------------------------------------
/mask2former/config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Adapted for AutoFocusFormer by Ziwen 2023
3 |
4 | from detectron2.config import CfgNode as CN
5 |
6 |
7 | def add_maskformer2_config(cfg):
8 | """
9 | Add config for MASK_FORMER.
10 | """
11 | # NOTE: configs from original maskformer
12 | # data config
13 | # select the dataset mapper
14 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
15 | # Color augmentation
16 | cfg.INPUT.COLOR_AUG_SSD = False
17 | # We retry random cropping until no single category in semantic segmentation GT occupies more
18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
20 | # Pad image and segmentation GT in dataset mapper.
21 | cfg.INPUT.SIZE_DIVISIBILITY = -1
22 |
23 | # solver config
24 | # weight decay on embedding
25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
26 | # optimizer
27 | cfg.SOLVER.OPTIMIZER = "ADAMW"
28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
29 |
30 | # mask_former model config
31 | cfg.MODEL.MASK_FORMER = CN()
32 |
33 | # loss
34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
35 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
36 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
37 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
38 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
39 |
40 | # transformer config
41 | cfg.MODEL.MASK_FORMER.NHEADS = 8
42 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
43 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
44 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
45 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
46 | cfg.MODEL.MASK_FORMER.PRE_NORM = False
47 |
48 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
49 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
50 |
51 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
52 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
53 |
54 | # mask_former inference config
55 | cfg.MODEL.MASK_FORMER.TEST = CN()
56 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
57 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
58 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
59 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
60 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
61 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
62 |
63 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
64 | # you can use this config to override
65 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
66 |
67 | # pixel decoder config
68 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
69 | # adding transformer in pixel decoder
70 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
71 | # pixel decoder
72 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MSDeformAttnPixelDecoder"
73 |
74 | # autofocusformer backbone
75 | cfg.MODEL.AFF = CN()
76 | cfg.MODEL.AFF.EMBED_DIM = [32, 128, 256, 384]
77 | cfg.MODEL.AFF.DEPTHS = [2, 2, 6, 2]
78 | cfg.MODEL.AFF.NUM_HEADS = [3, 6, 12, 24]
79 | cfg.MODEL.AFF.MLP_RATIO = 2.0
80 | cfg.MODEL.AFF.CLUSTER_SIZE = 8
81 | cfg.MODEL.AFF.NBHD_SIZE = [48, 48, 48, 48]
82 | cfg.MODEL.AFF.LAYER_SCALE = 0.0
83 | cfg.MODEL.AFF.ALPHA = 4.0
84 | cfg.MODEL.AFF.DS_RATE = 0.25
85 | cfg.MODEL.AFF.RESERVE = True
86 | cfg.MODEL.AFF.DROP_RATE = 0.0
87 | cfg.MODEL.AFF.ATTN_DROP_RATE = 0.0
88 | cfg.MODEL.AFF.DROP_PATH_RATE = 0.3
89 | cfg.MODEL.AFF.PATCH_NORM = True
90 | cfg.MODEL.AFF.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
91 | cfg.MODEL.AFF.SHEPARD_POWER = 6.0
92 | cfg.MODEL.AFF.SHEPARD_POWER_LEARNABLE = True
93 |
94 | # NOTE: maskformer2 extra configs
95 | # transformer module
96 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"
97 |
98 | # LSJ aug
99 | cfg.INPUT.IMAGE_SIZE = 1024
100 | cfg.INPUT.MIN_SCALE = 0.1
101 | cfg.INPUT.MAX_SCALE = 2.0
102 |
103 | # MSDeformAttn encoder configs
104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
105 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
106 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
107 |
108 | # point loss configs
109 | # Number of points sampled during training for a mask point head.
110 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
111 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
112 | # original paper.
113 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
114 |     # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
115 | # the original paper.
116 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
117 |
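118 | # Usage sketch (mirrors demo/demo.py's setup_cfg; the config path is one of the files in this repo):
119 | #   from detectron2.config import get_cfg
120 | #   from detectron2.projects.deeplab import add_deeplab_config
121 | #   cfg = get_cfg(); add_deeplab_config(cfg); add_maskformer2_config(cfg)
122 | #   cfg.merge_from_file("configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml")
123 | 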
--------------------------------------------------------------------------------
/mask2former/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | from . import datasets
4 |
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
3 |
4 | import copy
5 | import logging
6 |
7 | import numpy as np
8 | import torch
9 |
10 | from detectron2.config import configurable
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data import transforms as T
13 |
14 | from pycocotools import mask as coco_mask
15 |
16 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"]
17 |
18 |
19 | def convert_coco_poly_to_mask(segmentations, height, width):
20 | masks = []
21 | for polygons in segmentations:
22 | rles = coco_mask.frPyObjects(polygons, height, width)
23 | mask = coco_mask.decode(rles)
24 | if len(mask.shape) < 3:
25 | mask = mask[..., None]
26 | mask = torch.as_tensor(mask, dtype=torch.uint8)
27 | mask = mask.any(dim=2)
28 | masks.append(mask)
29 | if masks:
30 | masks = torch.stack(masks, dim=0)
31 | else:
32 | masks = torch.zeros((0, height, width), dtype=torch.uint8)
33 | return masks
34 |
35 |
36 | def build_transform_gen(cfg, is_train):
37 | """
38 | Create a list of default :class:`Augmentation` from config.
39 | Now it includes resizing and flipping.
40 | Returns:
41 | list[Augmentation]
42 | """
43 | assert is_train, "Only support training augmentation"
44 | image_size = cfg.INPUT.IMAGE_SIZE
45 | min_scale = cfg.INPUT.MIN_SCALE
46 | max_scale = cfg.INPUT.MAX_SCALE
47 |
48 | augmentation = []
49 |
50 | if cfg.INPUT.RANDOM_FLIP != "none":
51 | augmentation.append(
52 | T.RandomFlip(
53 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
54 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
55 | )
56 | )
57 |
58 | augmentation.extend([
59 | T.ResizeScale(
60 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
61 | ),
62 | T.FixedSizeCrop(crop_size=(image_size, image_size)),
63 | ])
64 |
65 | return augmentation
66 |
67 |
68 | # This is specifically designed for the COCO dataset.
69 | class COCOInstanceNewBaselineDatasetMapper:
70 | """
71 | A callable which takes a dataset dict in Detectron2 Dataset format,
72 |     and maps it into a format used by MaskFormer.
73 |
74 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
75 |
76 | The callable currently does the following:
77 |
78 |     1. Reads the image from "file_name"
79 |     2. Applies geometric transforms to the image and annotation
80 |     3. Finds and applies suitable cropping to the image and annotation
81 |     4. Prepares the image and annotation as Tensors
82 | """
83 |
84 | @configurable
85 | def __init__(
86 | self,
87 | is_train=True,
88 | *,
89 | tfm_gens,
90 | image_format,
91 | ):
92 | """
93 | NOTE: this interface is experimental.
94 | Args:
95 | is_train: for training or inference
96 | augmentations: a list of augmentations or deterministic transforms to apply
97 | tfm_gens: data augmentation
98 | image_format: an image format supported by :func:`detection_utils.read_image`.
99 | """
100 | self.tfm_gens = tfm_gens
101 | logging.getLogger(__name__).info(
102 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
103 | )
104 |
105 | self.img_format = image_format
106 | self.is_train = is_train
107 |
108 | @classmethod
109 | def from_config(cls, cfg, is_train=True):
110 | # Build augmentation
111 | tfm_gens = build_transform_gen(cfg, is_train)
112 |
113 | ret = {
114 | "is_train": is_train,
115 | "tfm_gens": tfm_gens,
116 | "image_format": cfg.INPUT.FORMAT,
117 | }
118 | return ret
119 |
120 | def __call__(self, dataset_dict):
121 | """
122 | Args:
123 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
124 |
125 | Returns:
126 | dict: a format that builtin models in detectron2 accept
127 | """
128 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
129 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
130 | utils.check_image_size(dataset_dict, image)
131 |
132 | # TODO: get padding mask
133 | # by feeding a "segmentation mask" to the same transforms
134 | padding_mask = np.ones(image.shape[:2])
135 |
136 | image, transforms = T.apply_transform_gens(self.tfm_gens, image)
137 | # the crop transformation has default padding value 0 for segmentation
138 | padding_mask = transforms.apply_segmentation(padding_mask)
139 | padding_mask = ~ padding_mask.astype(bool)
140 |
141 | image_shape = image.shape[:2] # h, w
142 |
143 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
144 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
145 | # Therefore it's important to use torch.Tensor.
146 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
147 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
148 |
149 | if not self.is_train:
150 | # USER: Modify this if you want to keep them for some reason.
151 | dataset_dict.pop("annotations", None)
152 | return dataset_dict
153 |
154 | if "annotations" in dataset_dict:
155 | # USER: Modify this if you want to keep them for some reason.
156 | for anno in dataset_dict["annotations"]:
157 | # Let's always keep mask
158 | # if not self.mask_on:
159 | # anno.pop("segmentation", None)
160 | anno.pop("keypoints", None)
161 |
162 | # USER: Implement additional transformations if you have other types of data
163 | annos = [
164 | utils.transform_instance_annotations(obj, transforms, image_shape)
165 | for obj in dataset_dict.pop("annotations")
166 | if obj.get("iscrowd", 0) == 0
167 | ]
168 | # NOTE: does not support BitMask due to augmentation
169 | # Current BitMask cannot handle empty objects
170 | instances = utils.annotations_to_instances(annos, image_shape)
171 | # After transforms such as cropping are applied, the bounding box may no longer
172 | # tightly bound the object. As an example, imagine a triangle object
173 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
174 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
175 | # the intersection of original bounding box and the cropping box.
176 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
177 | # Need to filter empty instances first (due to augmentation)
178 | instances = utils.filter_empty_instances(instances)
179 | # Generate masks from polygon
180 | h, w = instances.image_size
181 | # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
182 | if hasattr(instances, 'gt_masks'):
183 | gt_masks = instances.gt_masks
184 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
185 | instances.gt_masks = gt_masks
186 | dataset_dict["instances"] = instances
187 |
188 | return dataset_dict
189 |
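A minimal wiring sketch (not from this repository; it assumes a fully populated detectron2 config `cfg`) showing how the mapper above is typically plugged into a detectron2 training dataloader. `build_train_loader` is a hypothetical helper name:

from detectron2.data import build_detection_train_loader
from mask2former.data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import (
    COCOInstanceNewBaselineDatasetMapper,
)

def build_train_loader(cfg):
    # @configurable allows constructing the mapper directly from the config
    mapper = COCOInstanceNewBaselineDatasetMapper(cfg, is_train=True)
    return build_detection_train_loader(cfg, mapper=mapper)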
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
3 |
4 | import copy
5 | import logging
6 |
7 | import numpy as np
8 | import torch
9 |
10 | from detectron2.config import configurable
11 | from detectron2.data import detection_utils as utils
12 | from detectron2.data import transforms as T
13 | from detectron2.structures import BitMasks, Boxes, Instances
14 |
15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"]
16 |
17 |
18 | def build_transform_gen(cfg, is_train):
19 | """
20 | Create a list of default :class:`Augmentation` from config.
21 | Now it includes resizing and flipping.
22 | Returns:
23 | list[Augmentation]
24 | """
25 | assert is_train, "Only support training augmentation"
26 | image_size = cfg.INPUT.IMAGE_SIZE
27 | min_scale = cfg.INPUT.MIN_SCALE
28 | max_scale = cfg.INPUT.MAX_SCALE
29 |
30 | augmentation = []
31 |
32 | if cfg.INPUT.RANDOM_FLIP != "none":
33 | augmentation.append(
34 | T.RandomFlip(
35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
37 | )
38 | )
39 |
40 | augmentation.extend([
41 | T.ResizeScale(
42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
43 | ),
44 | T.FixedSizeCrop(crop_size=(image_size, image_size)),
45 | ])
46 |
47 | return augmentation
48 |
49 |
50 | # This is specifically designed for the COCO dataset.
51 | class COCOPanopticNewBaselineDatasetMapper:
52 | """
53 | A callable which takes a dataset dict in Detectron2 Dataset format,
54 |     and maps it into a format used by MaskFormer.
55 |
56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
57 |
58 | The callable currently does the following:
59 |
60 | 1. Read the image from "file_name"
61 |     2. Apply geometric transforms to the image and annotations
62 |     3. Find and apply suitable cropping to the image and annotations
63 |     4. Prepare the image and annotations as Tensors
64 | """
65 |
66 | @configurable
67 | def __init__(
68 | self,
69 | is_train=True,
70 | *,
71 | tfm_gens,
72 | image_format,
73 | ):
74 | """
75 | NOTE: this interface is experimental.
76 | Args:
77 | is_train: for training or inference
78 |             tfm_gens: a list of augmentations or deterministic transforms
79 |                 to apply to the input image during training
80 |                 (built by :func:`build_transform_gen`)
81 | image_format: an image format supported by :func:`detection_utils.read_image`.
82 | """
83 | self.tfm_gens = tfm_gens
84 | logging.getLogger(__name__).info(
85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
86 | str(self.tfm_gens)
87 | )
88 | )
89 |
90 | self.img_format = image_format
91 | self.is_train = is_train
92 |
93 | @classmethod
94 | def from_config(cls, cfg, is_train=True):
95 | # Build augmentation
96 | tfm_gens = build_transform_gen(cfg, is_train)
97 |
98 | ret = {
99 | "is_train": is_train,
100 | "tfm_gens": tfm_gens,
101 | "image_format": cfg.INPUT.FORMAT,
102 | }
103 | return ret
104 |
105 | def __call__(self, dataset_dict):
106 | """
107 | Args:
108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
109 |
110 | Returns:
111 | dict: a format that builtin models in detectron2 accept
112 | """
113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
115 | utils.check_image_size(dataset_dict, image)
116 |
117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image)
118 | image_shape = image.shape[:2] # h, w
119 |
120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
122 | # Therefore it's important to use torch.Tensor.
123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
124 |
125 | if not self.is_train:
126 | # USER: Modify this if you want to keep them for some reason.
127 | dataset_dict.pop("annotations", None)
128 | return dataset_dict
129 |
130 | if "pan_seg_file_name" in dataset_dict:
131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
132 | segments_info = dataset_dict["segments_info"]
133 |
134 | # apply the same transformation to panoptic segmentation
135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
136 |
137 | from panopticapi.utils import rgb2id
138 |
139 | pan_seg_gt = rgb2id(pan_seg_gt)
140 |
141 | instances = Instances(image_shape)
142 | classes = []
143 | masks = []
144 | for segment_info in segments_info:
145 | class_id = segment_info["category_id"]
146 | if not segment_info["iscrowd"]:
147 | classes.append(class_id)
148 | masks.append(pan_seg_gt == segment_info["id"])
149 |
150 | classes = np.array(classes)
151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
152 | if len(masks) == 0:
153 | # Some image does not have annotation (all ignored)
154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
155 | instances.gt_boxes = Boxes(torch.zeros((0, 4)))
156 | else:
157 | masks = BitMasks(
158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
159 | )
160 | instances.gt_masks = masks.tensor
161 | instances.gt_boxes = masks.get_bounding_boxes()
162 |
163 | dataset_dict["instances"] = instances
164 |
165 | return dataset_dict
166 |
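For reference, the large-scale-jitter pipeline assembled by build_transform_gen above reduces to the following for illustrative config values (image size 1024, scale range 0.1-2.0; these numbers are hypothetical, not taken from a config in this repository):

from detectron2.data import transforms as T

image_size, min_scale, max_scale = 1024, 0.1, 2.0   # hypothetical cfg.INPUT values
tfm_gens = [
    T.RandomFlip(horizontal=True, vertical=False),
    T.ResizeScale(min_scale=min_scale, max_scale=max_scale,
                  target_height=image_size, target_width=image_size),
    T.FixedSizeCrop(crop_size=(image_size, image_size)),
]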
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import copy
4 | import logging
5 |
6 | import numpy as np
7 | import pycocotools.mask as mask_util
8 | import torch
9 | from torch.nn import functional as F
10 |
11 | from detectron2.config import configurable
12 | from detectron2.data import detection_utils as utils
13 | from detectron2.data import transforms as T
14 | from detectron2.projects.point_rend import ColorAugSSDTransform
15 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
16 |
17 | __all__ = ["MaskFormerInstanceDatasetMapper"]
18 |
19 |
20 | class MaskFormerInstanceDatasetMapper:
21 | """
22 | A callable which takes a dataset dict in Detectron2 Dataset format,
23 |     and maps it into a format used by MaskFormer for instance segmentation.
24 |
25 | The callable currently does the following:
26 |
27 | 1. Read the image from "file_name"
28 |     2. Apply geometric transforms to the image and annotations
29 |     3. Find and apply suitable cropping to the image and annotations
30 |     4. Prepare the image and annotations as Tensors
31 | """
32 |
33 | @configurable
34 | def __init__(
35 | self,
36 | is_train=True,
37 | *,
38 | augmentations,
39 | image_format,
40 | size_divisibility,
41 | ):
42 | """
43 | NOTE: this interface is experimental.
44 | Args:
45 | is_train: for training or inference
46 | augmentations: a list of augmentations or deterministic transforms to apply
47 | image_format: an image format supported by :func:`detection_utils.read_image`.
48 | size_divisibility: pad image size to be divisible by this value
49 | """
50 | self.is_train = is_train
51 | self.tfm_gens = augmentations
52 | self.img_format = image_format
53 | self.size_divisibility = size_divisibility
54 |
55 | logger = logging.getLogger(__name__)
56 | mode = "training" if is_train else "inference"
57 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
58 |
59 | @classmethod
60 | def from_config(cls, cfg, is_train=True):
61 | # Build augmentation
62 | augs = [
63 | T.ResizeShortestEdge(
64 | cfg.INPUT.MIN_SIZE_TRAIN,
65 | cfg.INPUT.MAX_SIZE_TRAIN,
66 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
67 | )
68 | ]
69 | if cfg.INPUT.CROP.ENABLED:
70 | augs.append(
71 | T.RandomCrop(
72 | cfg.INPUT.CROP.TYPE,
73 | cfg.INPUT.CROP.SIZE,
74 | )
75 | )
76 | if cfg.INPUT.COLOR_AUG_SSD:
77 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
78 | augs.append(T.RandomFlip())
79 |
80 | ret = {
81 | "is_train": is_train,
82 | "augmentations": augs,
83 | "image_format": cfg.INPUT.FORMAT,
84 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
85 | }
86 | return ret
87 |
88 | def __call__(self, dataset_dict):
89 | """
90 | Args:
91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
92 |
93 | Returns:
94 | dict: a format that builtin models in detectron2 accept
95 | """
96 |         assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"
97 |
98 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
99 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
100 | utils.check_image_size(dataset_dict, image)
101 |
102 | aug_input = T.AugInput(image)
103 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
104 | image = aug_input.image
105 |
106 |         # transform instance masks
107 | assert "annotations" in dataset_dict
108 | for anno in dataset_dict["annotations"]:
109 | anno.pop("keypoints", None)
110 |
111 | annos = [
112 | utils.transform_instance_annotations(obj, transforms, image.shape[:2])
113 | for obj in dataset_dict.pop("annotations")
114 | if obj.get("iscrowd", 0) == 0
115 | ]
116 |
117 | if len(annos):
118 | assert "segmentation" in annos[0]
119 | segms = [obj["segmentation"] for obj in annos]
120 | masks = []
121 | for segm in segms:
122 | if isinstance(segm, list):
123 | # polygon
124 | masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
125 | elif isinstance(segm, dict):
126 | # COCO RLE
127 | masks.append(mask_util.decode(segm))
128 | elif isinstance(segm, np.ndarray):
129 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
130 | segm.ndim
131 | )
132 | # mask array
133 | masks.append(segm)
134 | else:
135 | raise ValueError(
136 | "Cannot convert segmentation of type '{}' to BitMasks!"
137 | "Supported types are: polygons as list[list[float] or ndarray],"
138 | " COCO-style RLE as a dict, or a binary segmentation mask "
139 | " in a 2D numpy array of shape HxW.".format(type(segm))
140 | )
141 |
142 | # Pad image and segmentation label here!
143 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
144 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
145 |
146 | classes = [int(obj["category_id"]) for obj in annos]
147 | classes = torch.tensor(classes, dtype=torch.int64)
148 |
149 | if self.size_divisibility > 0:
150 | image_size = (image.shape[-2], image.shape[-1])
151 | padding_size = [
152 | 0,
153 | self.size_divisibility - image_size[1],
154 | 0,
155 | self.size_divisibility - image_size[0],
156 | ]
157 | # pad image
158 | image = F.pad(image, padding_size, value=128).contiguous()
159 | # pad mask
160 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
161 |
162 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
163 |
164 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
165 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
166 | # Therefore it's important to use torch.Tensor.
167 | dataset_dict["image"] = image
168 |
169 | # Prepare per-category binary masks
170 | instances = Instances(image_shape)
171 | instances.gt_classes = classes
172 | if len(masks) == 0:
173 | # Some image does not have annotation (all ignored)
174 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
175 | else:
176 | masks = BitMasks(torch.stack(masks))
177 | instances.gt_masks = masks.tensor
178 |
179 | dataset_dict["instances"] = instances
180 |
181 | return dataset_dict
182 |
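The three segmentation encodings handled above (polygon lists, COCO RLE dicts, and raw HxW arrays) can be exercised with a toy sketch like the following; values are illustrative only:

import numpy as np
import pycocotools.mask as mask_util
from detectron2.structures import polygons_to_bitmask

h, w = 4, 4
poly = [np.array([0.0, 0.0, 3.0, 0.0, 3.0, 3.0, 0.0, 3.0])]   # one polygon as flat xy pairs
mask_from_poly = polygons_to_bitmask(poly, h, w)               # (4, 4) bool array

rle = mask_util.encode(np.asfortranarray(mask_from_poly.astype(np.uint8)))
mask_from_rle = mask_util.decode(rle)                          # (4, 4) uint8 array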
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import copy
4 |
5 | import numpy as np
6 | import torch
7 | from torch.nn import functional as F
8 |
9 | from detectron2.config import configurable
10 | from detectron2.data import detection_utils as utils
11 | from detectron2.data import transforms as T
12 | from detectron2.structures import BitMasks, Instances
13 |
14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
15 |
16 | __all__ = ["MaskFormerPanopticDatasetMapper"]
17 |
18 |
19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
20 | """
21 | A callable which takes a dataset dict in Detectron2 Dataset format,
22 |     and maps it into a format used by MaskFormer for panoptic segmentation.
23 |
24 | The callable currently does the following:
25 |
26 | 1. Read the image from "file_name"
27 |     2. Apply geometric transforms to the image and annotations
28 |     3. Find and apply suitable cropping to the image and annotations
29 |     4. Prepare the image and annotations as Tensors
30 | """
31 |
32 | @configurable
33 | def __init__(
34 | self,
35 | is_train=True,
36 | *,
37 | augmentations,
38 | image_format,
39 | ignore_label,
40 | size_divisibility,
41 | ):
42 | """
43 | NOTE: this interface is experimental.
44 | Args:
45 | is_train: for training or inference
46 | augmentations: a list of augmentations or deterministic transforms to apply
47 | image_format: an image format supported by :func:`detection_utils.read_image`.
48 |             ignore_label: the label that is ignored during evaluation
49 | size_divisibility: pad image size to be divisible by this value
50 | """
51 | super().__init__(
52 | is_train,
53 | augmentations=augmentations,
54 | image_format=image_format,
55 | ignore_label=ignore_label,
56 | size_divisibility=size_divisibility,
57 | )
58 |
59 | def __call__(self, dataset_dict):
60 | """
61 | Args:
62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
63 |
64 | Returns:
65 | dict: a format that builtin models in detectron2 accept
66 | """
67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"
68 |
69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
71 | utils.check_image_size(dataset_dict, image)
72 |
73 | # semantic segmentation
74 | if "sem_seg_file_name" in dataset_dict:
75 | # PyTorch transformation not implemented for uint16, so converting it to double first
76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
77 | else:
78 | sem_seg_gt = None
79 |
80 | # panoptic segmentation
81 | if "pan_seg_file_name" in dataset_dict:
82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
83 | segments_info = dataset_dict["segments_info"]
84 | else:
85 | pan_seg_gt = None
86 | segments_info = None
87 |
88 | if pan_seg_gt is None:
89 | raise ValueError(
90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
91 | dataset_dict["file_name"]
92 | )
93 | )
94 |
95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
97 | image = aug_input.image
98 | if sem_seg_gt is not None:
99 | sem_seg_gt = aug_input.sem_seg
100 |
101 | # apply the same transformation to panoptic segmentation
102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
103 |
104 | from panopticapi.utils import rgb2id
105 |
106 | pan_seg_gt = rgb2id(pan_seg_gt)
107 |
108 | # Pad image and segmentation label here!
109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
110 | if sem_seg_gt is not None:
111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
113 |
114 | if self.size_divisibility > 0:
115 | image_size = (image.shape[-2], image.shape[-1])
116 | padding_size = [
117 | 0,
118 | self.size_divisibility - image_size[1],
119 | 0,
120 | self.size_divisibility - image_size[0],
121 | ]
122 | image = F.pad(image, padding_size, value=128).contiguous()
123 | if sem_seg_gt is not None:
124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
125 | pan_seg_gt = F.pad(
126 | pan_seg_gt, padding_size, value=0
127 | ).contiguous() # 0 is the VOID panoptic label
128 |
129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
130 |
131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
133 | # Therefore it's important to use torch.Tensor.
134 | dataset_dict["image"] = image
135 | if sem_seg_gt is not None:
136 | dataset_dict["sem_seg"] = sem_seg_gt.long()
137 |
138 | if "annotations" in dataset_dict:
139 |             raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")
140 |
141 | # Prepare per-category binary masks
142 | pan_seg_gt = pan_seg_gt.numpy()
143 | instances = Instances(image_shape)
144 | classes = []
145 | masks = []
146 | for segment_info in segments_info:
147 | class_id = segment_info["category_id"]
148 | if not segment_info["iscrowd"]:
149 | classes.append(class_id)
150 | masks.append(pan_seg_gt == segment_info["id"])
151 |
152 | classes = np.array(classes)
153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
154 | if len(masks) == 0:
155 | # Some image does not have annotation (all ignored)
156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
157 | else:
158 | masks = BitMasks(
159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
160 | )
161 | instances.gt_masks = masks.tensor
162 |
163 | dataset_dict["instances"] = instances
164 |
165 | return dataset_dict
166 |
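For reference, panopticapi's rgb2id used above decodes a panoptic PNG pixel into a segment id as R + 256*G + 256^2*B; a toy sketch (illustrative only):

import numpy as np
from panopticapi.utils import rgb2id

pan_png = np.zeros((2, 2, 3), dtype=np.uint8)
pan_png[0, 0] = (7, 0, 0)    # encodes segment id 7
pan_png[1, 1] = (0, 1, 0)    # encodes segment id 256
ids = rgb2id(pan_png)        # array([[7, 0], [0, 256]])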
--------------------------------------------------------------------------------
/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import copy
4 | import logging
5 |
6 | import numpy as np
7 | import torch
8 | from torch.nn import functional as F
9 |
10 | from detectron2.config import configurable
11 | from detectron2.data import MetadataCatalog
12 | from detectron2.data import detection_utils as utils
13 | from detectron2.data import transforms as T
14 | from detectron2.projects.point_rend import ColorAugSSDTransform
15 | from detectron2.structures import BitMasks, Instances
16 |
17 | __all__ = ["MaskFormerSemanticDatasetMapper"]
18 |
19 |
20 | class MaskFormerSemanticDatasetMapper:
21 | """
22 | A callable which takes a dataset dict in Detectron2 Dataset format,
23 |     and maps it into a format used by MaskFormer for semantic segmentation.
24 |
25 | The callable currently does the following:
26 |
27 | 1. Read the image from "file_name"
28 |     2. Apply geometric transforms to the image and annotations
29 |     3. Find and apply suitable cropping to the image and annotations
30 |     4. Prepare the image and annotations as Tensors
31 | """
32 |
33 | @configurable
34 | def __init__(
35 | self,
36 | is_train=True,
37 | *,
38 | augmentations,
39 | image_format,
40 | ignore_label,
41 | size_divisibility,
42 | ):
43 | """
44 | NOTE: this interface is experimental.
45 | Args:
46 | is_train: for training or inference
47 | augmentations: a list of augmentations or deterministic transforms to apply
48 | image_format: an image format supported by :func:`detection_utils.read_image`.
49 |             ignore_label: the label that is ignored during evaluation
50 | size_divisibility: pad image size to be divisible by this value
51 | """
52 | self.is_train = is_train
53 | self.tfm_gens = augmentations
54 | self.img_format = image_format
55 | self.ignore_label = ignore_label
56 | self.size_divisibility = size_divisibility
57 |
58 | logger = logging.getLogger(__name__)
59 | mode = "training" if is_train else "inference"
60 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
61 |
62 | @classmethod
63 | def from_config(cls, cfg, is_train=True):
64 | # Build augmentation
65 | augs = [
66 | T.ResizeShortestEdge(
67 | cfg.INPUT.MIN_SIZE_TRAIN,
68 | cfg.INPUT.MAX_SIZE_TRAIN,
69 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
70 | )
71 | ]
72 | if cfg.INPUT.CROP.ENABLED:
73 | augs.append(
74 | T.RandomCrop_CategoryAreaConstraint(
75 | cfg.INPUT.CROP.TYPE,
76 | cfg.INPUT.CROP.SIZE,
77 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
78 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
79 | )
80 | )
81 | if cfg.INPUT.COLOR_AUG_SSD:
82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
83 | augs.append(T.RandomFlip())
84 |
85 | # Assume always applies to the training set.
86 | dataset_names = cfg.DATASETS.TRAIN
87 | meta = MetadataCatalog.get(dataset_names[0])
88 | ignore_label = meta.ignore_label
89 |
90 | ret = {
91 | "is_train": is_train,
92 | "augmentations": augs,
93 | "image_format": cfg.INPUT.FORMAT,
94 | "ignore_label": ignore_label,
95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
96 | }
97 | return ret
98 |
99 | def __call__(self, dataset_dict):
100 | """
101 | Args:
102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
103 |
104 | Returns:
105 | dict: a format that builtin models in detectron2 accept
106 | """
107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"
108 |
109 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
110 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
111 | utils.check_image_size(dataset_dict, image)
112 |
113 | if "sem_seg_file_name" in dataset_dict:
114 | # PyTorch transformation not implemented for uint16, so converting it to double first
115 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
116 | else:
117 | sem_seg_gt = None
118 |
119 | if sem_seg_gt is None:
120 | raise ValueError(
121 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
122 | dataset_dict["file_name"]
123 | )
124 | )
125 |
126 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
127 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
128 | image = aug_input.image
129 | sem_seg_gt = aug_input.sem_seg
130 |
131 | # Pad image and segmentation label here!
132 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
133 | if sem_seg_gt is not None:
134 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
135 |
136 | if self.size_divisibility > 0:
137 | image_size = (image.shape[-2], image.shape[-1])
138 | padding_size = [
139 | 0,
140 | self.size_divisibility - image_size[1],
141 | 0,
142 | self.size_divisibility - image_size[0],
143 | ]
144 | image = F.pad(image, padding_size, value=128).contiguous()
145 | if sem_seg_gt is not None:
146 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
147 |
148 | image_shape = (image.shape[-2], image.shape[-1]) # h, w
149 |
150 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
151 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
152 | # Therefore it's important to use torch.Tensor.
153 | dataset_dict["image"] = image
154 |
155 | if sem_seg_gt is not None:
156 | dataset_dict["sem_seg"] = sem_seg_gt.long()
157 |
158 | if "annotations" in dataset_dict:
159 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
160 |
161 | # Prepare per-category binary masks
162 | if sem_seg_gt is not None:
163 | sem_seg_gt = sem_seg_gt.numpy()
164 | instances = Instances(image_shape)
165 | classes = np.unique(sem_seg_gt)
166 | # remove ignored region
167 | classes = classes[classes != self.ignore_label]
168 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
169 |
170 | masks = []
171 | for class_id in classes:
172 | masks.append(sem_seg_gt == class_id)
173 |
174 | if len(masks) == 0:
175 | # Some image does not have annotation (all ignored)
176 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
177 | else:
178 | masks = BitMasks(
179 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
180 | )
181 | instances.gt_masks = masks.tensor
182 |
183 | dataset_dict["instances"] = instances
184 |
185 | return dataset_dict
186 |
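The per-class binary-mask construction at the end of __call__ above reduces to the following toy sketch (illustrative only; the ignore label here is the COCO/ADE default of 255):

import numpy as np

ignore_label = 255
sem_seg = np.array([[1, 1, 255],
                    [2, 2, 255]])
classes = np.unique(sem_seg)
classes = classes[classes != ignore_label]     # array([1, 2])
masks = [sem_seg == c for c in classes]        # one HxW boolean mask per class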
--------------------------------------------------------------------------------
/mask2former/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | from . import register_coco_panoptic_annos_semseg
4 |
--------------------------------------------------------------------------------
/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import json
4 | import os
5 |
6 | from detectron2.data import DatasetCatalog, MetadataCatalog
7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
8 | from detectron2.utils.file_io import PathManager
9 |
10 |
11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = {
12 | "coco_2017_train_panoptic": (
13 | # This is the original panoptic annotation directory
14 | "coco/panoptic_train2017",
15 | "coco/annotations/panoptic_train2017.json",
16 | # This directory contains semantic annotations that are
17 | # converted from panoptic annotations.
18 | # It is used by PanopticFPN.
19 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
20 | # to create these directories.
21 | "coco/panoptic_semseg_train2017",
22 | ),
23 | "coco_2017_val_panoptic": (
24 | "coco/panoptic_val2017",
25 | "coco/annotations/panoptic_val2017.json",
26 | "coco/panoptic_semseg_val2017",
27 | ),
28 | }
29 |
30 |
31 | def get_metadata():
32 | meta = {}
33 | # The following metadata maps contiguous id from [0, #thing categories +
34 |     # #stuff categories) to their names and colors. We keep two copies of the
35 | # same name and color under "thing_*" and "stuff_*" because the current
36 |     # visualization function in D2 handles thing and stuff classes differently
37 | # due to some heuristic used in Panoptic FPN. We keep the same naming to
38 | # enable reusing existing visualization functions.
39 | thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
40 | thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
41 | stuff_classes = [k["name"] for k in COCO_CATEGORIES]
42 | stuff_colors = [k["color"] for k in COCO_CATEGORIES]
43 |
44 | meta["thing_classes"] = thing_classes
45 | meta["thing_colors"] = thing_colors
46 | meta["stuff_classes"] = stuff_classes
47 | meta["stuff_colors"] = stuff_colors
48 |
49 | # Convert category id for training:
50 | # category id: like semantic segmentation, it is the class id for each
51 | # pixel. Since there are some classes not used in evaluation, the category
52 |     # id is not always contiguous and thus we have two sets of category ids:
53 | # - original category id: category id in the original dataset, mainly
54 | # used for evaluation.
55 | # - contiguous category id: [0, #classes), in order to train the linear
56 | # softmax classifier.
57 | thing_dataset_id_to_contiguous_id = {}
58 | stuff_dataset_id_to_contiguous_id = {}
59 |
60 | for i, cat in enumerate(COCO_CATEGORIES):
61 | if cat["isthing"]:
62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i
63 | # else:
64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i
65 |
66 | # in order to use sem_seg evaluator
67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i
68 |
69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
71 |
72 | return meta
73 |
74 |
75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
76 | """
77 | Args:
78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
81 | Returns:
82 | list[dict]: a list of dicts in Detectron2 standard format. (See
83 |         `Using Custom Datasets <https://detectron2.readthedocs.io/tutorials/datasets.html>`_ )
84 | """
85 |
86 | def _convert_category_id(segment_info, meta):
87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
89 | segment_info["category_id"]
90 | ]
91 | segment_info["isthing"] = True
92 | else:
93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
94 | segment_info["category_id"]
95 | ]
96 | segment_info["isthing"] = False
97 | return segment_info
98 |
99 | with PathManager.open(json_file) as f:
100 | json_info = json.load(f)
101 |
102 | ret = []
103 | for ann in json_info["annotations"]:
104 | image_id = int(ann["image_id"])
105 |         # TODO: currently we assume image and label have the same filename but
106 | # different extension, and images have extension ".jpg" for COCO. Need
107 | # to make image extension a user-provided argument if we extend this
108 | # function to support other COCO-like datasets.
109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
110 | label_file = os.path.join(gt_dir, ann["file_name"])
111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"])
112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
113 | ret.append(
114 | {
115 | "file_name": image_file,
116 | "image_id": image_id,
117 | "pan_seg_file_name": label_file,
118 | "sem_seg_file_name": sem_label_file,
119 | "segments_info": segments_info,
120 | }
121 | )
122 | assert len(ret), f"No images found in {image_dir}!"
123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
126 | return ret
127 |
128 |
129 | def register_coco_panoptic_annos_sem_seg(
130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
131 | ):
132 | panoptic_name = name
133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes")
134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors")
135 | MetadataCatalog.get(panoptic_name).set(
136 | thing_classes=metadata["thing_classes"],
137 | thing_colors=metadata["thing_colors"],
138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"],
139 | )
140 |
141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
142 | semantic_name = name + "_with_sem_seg"
143 | DatasetCatalog.register(
144 | semantic_name,
145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata),
146 | )
147 | MetadataCatalog.get(semantic_name).set(
148 | sem_seg_root=sem_seg_root,
149 | panoptic_root=panoptic_root,
150 | image_root=image_root,
151 | panoptic_json=panoptic_json,
152 | json_file=instances_json,
153 | evaluator_type="coco_panoptic_seg",
154 | ignore_label=255,
155 | label_divisor=1000,
156 | **metadata,
157 | )
158 |
159 |
160 | def register_all_coco_panoptic_annos_sem_seg(root):
161 | for (
162 | prefix,
163 | (panoptic_root, panoptic_json, semantic_root),
164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
165 | prefix_instances = prefix[: -len("_panoptic")]
166 | instances_meta = MetadataCatalog.get(prefix_instances)
167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file
168 |
169 | register_coco_panoptic_annos_sem_seg(
170 | prefix,
171 | get_metadata(),
172 | image_root,
173 | os.path.join(root, panoptic_root),
174 | os.path.join(root, panoptic_json),
175 | os.path.join(root, semantic_root),
176 | instances_json,
177 | )
178 |
179 |
180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets")
181 | register_all_coco_panoptic_annos_sem_seg(_root)
182 |
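Once this module has been imported (for example via `import mask2former`), the combined panoptic+semantic splits registered above can be looked up from the detectron2 catalogs. A hedged sketch, assuming the COCO panoptic files actually exist under $DETECTRON2_DATASETS:

from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("coco_2017_val_panoptic_with_sem_seg")
meta = MetadataCatalog.get("coco_2017_val_panoptic_with_sem_seg")
print(len(dicts), sorted(dicts[0].keys()))      # file_name, image_id, pan_seg_file_name, ...
print(meta.ignore_label, meta.label_divisor)    # 255, 1000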
--------------------------------------------------------------------------------
/mask2former/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask2former/evaluation/instance_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import itertools
4 | import json
5 | import os
6 |
7 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
8 | from detectron2.utils.file_io import PathManager
9 |
10 |
11 | # modified from COCOEvaluator for instance segmentation
12 | class InstanceSegEvaluator(COCOEvaluator):
13 | """
14 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP
15 | for keypoint detection outputs using COCO's metrics.
16 | See http://cocodataset.org/#detection-eval and
17 | http://cocodataset.org/#keypoints-eval to understand its metrics.
18 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
19 | the metric cannot be computed (e.g. due to no predictions made).
20 |
21 | In addition to COCO, this evaluator is able to support any bounding box detection,
22 | instance segmentation, or keypoint detection dataset.
23 | """
24 |
25 | def _eval_predictions(self, predictions, img_ids=None):
26 | """
27 | Evaluate predictions. Fill self._results with the metrics of the tasks.
28 | """
29 | self._logger.info("Preparing results for COCO format ...")
30 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
31 | tasks = self._tasks or self._tasks_from_predictions(coco_results)
32 |
33 | # unmap the category ids for COCO
34 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
35 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
36 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
37 | # num_classes = len(all_contiguous_ids)
38 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
39 |
40 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
41 | for result in coco_results:
42 | category_id = result["category_id"]
43 | # assert category_id < num_classes, (
44 | # f"A prediction has class={category_id}, "
45 | # f"but the dataset only has {num_classes} classes and "
46 | # f"predicted class id should be in [0, {num_classes - 1}]."
47 | # )
48 | assert category_id in reverse_id_mapping, (
49 | f"A prediction has class={category_id}, "
50 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
51 | )
52 | result["category_id"] = reverse_id_mapping[category_id]
53 |
54 | if self._output_dir:
55 | file_path = os.path.join(self._output_dir, "coco_instances_results.json")
56 | self._logger.info("Saving results to {}".format(file_path))
57 | with PathManager.open(file_path, "w") as f:
58 | f.write(json.dumps(coco_results))
59 | f.flush()
60 |
61 | if not self._do_evaluation:
62 | self._logger.info("Annotations are not available for evaluation.")
63 | return
64 |
65 | self._logger.info(
66 | "Evaluating predictions with {} COCO API...".format(
67 | "unofficial" if self._use_fast_impl else "official"
68 | )
69 | )
70 | for task in sorted(tasks):
71 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
72 | coco_eval = (
73 | _evaluate_predictions_on_coco(
74 | self._coco_api,
75 | coco_results,
76 | task,
77 | kpt_oks_sigmas=self._kpt_oks_sigmas,
78 | use_fast_impl=self._use_fast_impl,
79 | img_ids=img_ids,
80 | max_dets_per_image=self._max_dets_per_image,
81 | )
82 | if len(coco_results) > 0
83 | else None # cocoapi does not handle empty results very well
84 | )
85 |
86 | res = self._derive_coco_results(
87 | coco_eval, task, class_names=self._metadata.get("thing_classes")
88 | )
89 | self._results[task] = res
90 |
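A hedged usage sketch for the evaluator above (the dataset name and output directory are placeholders); since it subclasses COCOEvaluator, it is constructed the same way:

from mask2former.evaluation.instance_evaluation import InstanceSegEvaluator

evaluator = InstanceSegEvaluator(
    "coco_2017_val",             # any registered instance-segmentation split
    tasks=("segm",),
    distributed=True,
    output_dir="./output/eval",  # placeholder path
)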
--------------------------------------------------------------------------------
/mask2former/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Adapted for AutoFocusFormer by Ziwen 2023
3 |
4 | from .backbone.aff import AutoFocusFormer
5 |
6 | from .pixel_decoder.msdeformattn_pc import MSDeformAttnPixelDecoder
7 | from .meta_arch.mask_former_head import MaskFormerHead
8 |
--------------------------------------------------------------------------------
/mask2former/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | from .clusten import CLUSTENQKFunction, CLUSTENAVFunction, CLUSTENWFFunction, WEIGHTEDGATHERFunction, MSDETRPCFunction
7 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/clusten.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | from torch.autograd import Function
7 |
8 | try:
9 | import clustenqk_cuda
10 | import clustenav_cuda
11 | import clustenwf_cuda
12 | import weighted_gather_cuda
13 | import msdetrpc_cuda
14 | except ImportError:
15 | raise RuntimeError("Could not load CLUSTEN CUDA extension. " +
16 | "Please make sure your device has CUDA, the CUDA toolkit for PyTorch is installed, and that you've compiled CLUSTEN correctly.")
17 |
18 |
19 | class CLUSTENQKFunction(Function):
20 | """
21 | query times key function
22 | """
23 | @staticmethod
24 | def forward(ctx, query, key, nbhd_idx):
25 | query = query.contiguous()
26 | key = key.contiguous()
27 | if key.dtype != query.dtype:
28 | key = key.to(query.dtype)
29 | nbhd_idx = nbhd_idx.contiguous()
30 | attn = clustenqk_cuda.forward(
31 | query,
32 | key.permute(0, 1, 3, 2).contiguous(),
33 | nbhd_idx)
34 | ctx.save_for_backward(query, key, nbhd_idx)
35 | return attn
36 |
37 | @staticmethod
38 | def backward(ctx, grad_attn):
39 | outputs = clustenqk_cuda.backward(
40 | grad_attn.contiguous(), *ctx.saved_tensors)
41 | d_query, d_key = outputs
42 | return d_query, d_key, None
43 |
44 |
45 | class CLUSTENAVFunction(Function):
46 | """
47 | attention times value function
48 | """
49 | @staticmethod
50 | def forward(ctx, attn, v, nbhd_idx):
51 | attn = attn.contiguous()
52 | v = v.contiguous()
53 | nbhd_idx = nbhd_idx.contiguous()
54 | if attn.dtype != v.dtype:
55 | v = v.to(attn.dtype)
56 | feat = clustenav_cuda.forward(
57 | attn,
58 | v,
59 | nbhd_idx)
60 | ctx.save_for_backward(attn, v, nbhd_idx)
61 | return feat
62 |
63 | @staticmethod
64 | def backward(ctx, grad_feat):
65 | outputs = clustenav_cuda.backward(
66 | grad_feat.contiguous(), *ctx.saved_tensors)
67 | d_attn, d_v = outputs
68 | return d_attn, d_v, None
69 |
70 |
71 | class CLUSTENWFFunction(Function):
72 | """
73 | weight times feature function
74 | """
75 | @staticmethod
76 | def forward(ctx, weights, feat, nbhd_idx):
77 | weights = weights.contiguous()
78 | feat = feat.contiguous()
79 | nbhd_idx = nbhd_idx.contiguous()
80 | if feat.dtype != weights.dtype:
81 | feat = feat.to(weights.dtype)
82 | feat_new = clustenwf_cuda.forward(
83 | weights,
84 | feat,
85 | nbhd_idx)
86 | ctx.save_for_backward(weights, feat, nbhd_idx)
87 | return feat_new
88 |
89 | @staticmethod
90 | def backward(ctx, grad_feat_new):
91 | outputs = clustenwf_cuda.backward(
92 | grad_feat_new.contiguous(), *ctx.saved_tensors)
93 | d_weights, d_feat = outputs
94 | return d_weights, d_feat, None
95 |
96 |
97 | class WEIGHTEDGATHERFunction(Function):
98 | """
99 | weighted gather function
100 | """
101 | @staticmethod
102 | def forward(ctx, nbhd_idx, weights, feat):
103 | nbhd_idx = nbhd_idx.contiguous()
104 | weights = weights.contiguous()
105 | feat = feat.contiguous()
106 | if feat.dtype != weights.dtype:
107 | weights = weights.to(feat.dtype)
108 | feat_new = weighted_gather_cuda.forward(
109 | nbhd_idx,
110 | weights,
111 | feat)
112 | ctx.save_for_backward(nbhd_idx, weights, feat)
113 | return feat_new
114 |
115 | @staticmethod
116 | def backward(ctx, grad_feat_new):
117 | outputs = weighted_gather_cuda.backward(
118 | grad_feat_new.contiguous(), *ctx.saved_tensors)
119 | d_weights, d_feat = outputs
120 | return None, d_weights, d_feat
121 |
122 |
123 | class MSDETRPCFunction(Function):
124 | """
125 | deformable multi scale detr point cloud function
126 | """
127 | @staticmethod
128 | def forward(ctx, nn_idx, nn_weight, attn, val):
129 | nn_idx = nn_idx.contiguous()
130 | nn_weight = nn_weight.contiguous()
131 | attn = attn.contiguous()
132 | val = val.contiguous()
133 | feat = msdetrpc_cuda.forward(
134 | nn_idx,
135 | nn_weight,
136 | attn,
137 | val)
138 | ctx.save_for_backward(nn_idx, nn_weight, attn, val)
139 | return feat
140 |
141 | @staticmethod
142 | def backward(ctx, grad_feat):
143 | outputs = msdetrpc_cuda.backward(
144 | grad_feat.contiguous(), *ctx.saved_tensors)
145 | d_weight, d_attn, d_val = outputs
146 | return None, d_weight, d_attn, d_val
147 |
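A shape-level sketch of the two core ops above, based on the tensor layouts documented in the CUDA bindings (query/key/value b x h x n x c, nbhd_idx b x n x m). It requires the compiled clusten CUDA extensions and a CUDA device; all sizes are toy values:

import torch
from mask2former.modeling.clusten import CLUSTENQKFunction, CLUSTENAVFunction

b, h, n, c, m = 2, 4, 16, 32, 8
q = torch.randn(b, h, n, c, device="cuda")
k = torch.randn(b, h, n, c, device="cuda")
v = torch.randn(b, h, n, c, device="cuda")
nbhd_idx = torch.randint(0, n, (b, n, m), device="cuda")   # int64 neighbor indices

attn = CLUSTENQKFunction.apply(q, k, nbhd_idx)                      # (b, h, n, m)
feat = CLUSTENAVFunction.apply(attn.softmax(dim=-1), v, nbhd_idx)   # (b, h, n, c)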
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/clustenav_cuda.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 | #include <vector>
8 |
9 | torch::Tensor clusten_av_cuda_forward(
10 | const torch::Tensor &attn, // b x h x n x m
11 | const torch::Tensor &v, // b x h x n x c
12 | const torch::Tensor &nbhd_idx); // b x n x m
13 |
14 | std::vector<torch::Tensor> clusten_av_cuda_backward(
15 | const torch::Tensor &d_feat,
16 | const torch::Tensor &attn,
17 | const torch::Tensor &v,
18 | const torch::Tensor &nbhd_idx);
19 |
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 |
25 | torch::Tensor clusten_av_forward(
26 | const torch::Tensor &attn,
27 | const torch::Tensor &v,
28 | const torch::Tensor &nbhd_idx) {
29 | CHECK_INPUT(attn);
30 | CHECK_INPUT(v);
31 | CHECK_INPUT(nbhd_idx);
32 | return clusten_av_cuda_forward(attn, v, nbhd_idx);
33 | }
34 |
35 | std::vector<torch::Tensor> clusten_av_backward(
36 | const torch::Tensor &d_feat,
37 | const torch::Tensor &attn,
38 | const torch::Tensor &v,
39 | const torch::Tensor &nbhd_idx) {
40 | CHECK_INPUT(d_feat);
41 | CHECK_INPUT(attn);
42 | CHECK_INPUT(v);
43 | CHECK_INPUT(nbhd_idx);
44 | return clusten_av_cuda_backward(d_feat, attn, v, nbhd_idx);
45 | }
46 |
47 |
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 | m.def("forward", &clusten_av_forward, "CLUSTENAV forward (CUDA)");
50 | m.def("backward", &clusten_av_backward, "CLUSTENAV backward (CUDA)");
51 | }
52 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/clustenqk_cuda.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 | #include <vector>
8 |
9 | torch::Tensor clusten_qk_cuda_forward(
10 | const torch::Tensor &query, // b x h x n x c
11 | const torch::Tensor &key, // b x h x n x c
12 | const torch::Tensor &nbhd_idx); // b x n x m
13 |
14 | std::vector<torch::Tensor> clusten_qk_cuda_backward(
15 | const torch::Tensor &d_attn,
16 | const torch::Tensor &query,
17 | const torch::Tensor &key,
18 | const torch::Tensor &nbhd_idx);
19 |
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 |
25 | torch::Tensor clusten_qk_forward(
26 | const torch::Tensor &query,
27 | const torch::Tensor &key,
28 | const torch::Tensor &nbhd_idx) {
29 | CHECK_INPUT(query);
30 | CHECK_INPUT(key);
31 | CHECK_INPUT(nbhd_idx);
32 | return clusten_qk_cuda_forward(query, key, nbhd_idx);
33 | }
34 |
35 | std::vector<torch::Tensor> clusten_qk_backward(
36 | const torch::Tensor &d_attn,
37 | const torch::Tensor &query,
38 | const torch::Tensor &key,
39 | const torch::Tensor &nbhd_idx) {
40 | CHECK_INPUT(d_attn);
41 | CHECK_INPUT(query);
42 | CHECK_INPUT(key);
43 | CHECK_INPUT(nbhd_idx);
44 | return clusten_qk_cuda_backward(d_attn, query, key, nbhd_idx);
45 | }
46 |
47 |
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 | m.def("forward", &clusten_qk_forward, "CLUSTENQK forward (CUDA)");
50 | m.def("backward", &clusten_qk_backward, "CLUSTENQK backward (CUDA)");
51 | }
52 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/clustenqk_cuda_kernel.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 |
8 | #include <cuda.h>
9 | #include <cuda_runtime.h>
10 | #include <vector>
11 | #include <ATen/ATen.h>
12 | #include <ATen/AccumulateType.h>
13 | #include <ATen/cuda/CUDAContext.h>
14 | #include <ATen/native/cuda/KernelUtils.cuh>
15 |
16 | #define CUDA_NUM_THREADS 1024
17 |
18 | template <typename scalar_t>
19 | __global__ void clusten_qk_cuda_forward_kernel(
20 |     const torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> query,    // b x h x n x c
21 |     const torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> key,      // b x h x c x n (reordered by cluster)
22 |     const torch::PackedTensorAccessor32<int64_t, 3, torch::DefaultPtrTraits> nbhd_idx,  // b x n x m
23 |     torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> attn,           // b x h x n x m
24 | const int length, // n
25 | const int batch_size, // b
26 | const int heads, // h
27 | const int nbhd_size, // m
28 | const int dim) { // c
29 |
30 | const int z = blockIdx.z * blockDim.z + threadIdx.z;
31 | if (z < batch_size * heads){
32 | const int i = blockIdx.y * blockDim.y + threadIdx.y;
33 | if (i < length){
34 | const int ni = blockIdx.x * blockDim.x + threadIdx.x;
35 | if (ni < nbhd_size){
36 | const int b = z / heads;
37 | const int h = z - b * heads;
38 | int64_t nbi = nbhd_idx[b][i][ni];
39 | // calculate q@k
40 | scalar_t updt = scalar_t(0);
41 | #pragma unroll
42 | for (unsigned int c=0; c < dim; ++c) {
43 | updt += query[b][h][i][c] * key[b][h][c][nbi];
44 | }
45 | attn[b][h][i][ni] = updt;
46 | }
47 | }
48 | }
49 | }
50 |
51 |
52 | torch::Tensor clusten_qk_cuda_forward(
53 | const torch::Tensor &query,
54 | const torch::Tensor &key,
55 | const torch::Tensor &nbhd_idx) {
56 |
57 | int64_t batch_size = query.size(0);
58 | int64_t heads = query.size(1);
59 | int64_t length = query.size(2);
60 | int64_t dim = query.size(3);
61 | int64_t nbhd_size = nbhd_idx.size(2);
62 | int zsize = batch_size * heads;
63 |
64 | int NBHDTHREADS = min(int64_t(CUDA_NUM_THREADS), nbhd_size);
65 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / NBHDTHREADS), length);
66 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * NBHDTHREADS));
67 |
68 | auto attn = torch::zeros(
69 | {batch_size, heads, length, nbhd_size}, query.options());
70 |
71 | const auto stream = c10::cuda::getCurrentCUDAStream();
72 | const dim3 blocks(
73 | (dim + NBHDTHREADS - 1) / NBHDTHREADS,
74 | (length + TOKENTHREADS - 1) / TOKENTHREADS,
75 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS);
76 | const dim3 threads(NBHDTHREADS, TOKENTHREADS, BATCHTHREADS);
77 |
78 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_forward", ([&] {
79 |         const auto query_a = query.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
80 |         const auto key_a = key.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
81 |         const auto nbhd_idx_a = nbhd_idx.packed_accessor32<int64_t, 3, torch::DefaultPtrTraits>();
82 |         auto attn_a = attn.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
83 |
84 |         clusten_qk_cuda_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
85 | query_a, key_a, nbhd_idx_a, attn_a,
86 | length, batch_size, heads, nbhd_size, dim);
87 | }));
88 | return attn;
89 | }
90 |
91 | template <typename scalar_t>
92 | __global__ void clusten_qk_cuda_backward_kernel(
93 |     const torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> d_attn,
94 |     const torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> query,
95 |     const torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> key,
96 |     const torch::PackedTensorAccessor32<int64_t, 3, torch::DefaultPtrTraits> nbhd_idx,
97 |     torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> d_query,
98 |     torch::PackedTensorAccessor32<scalar_t, 4, torch::DefaultPtrTraits> d_key,
99 | const int length,
100 | const int batch_size,
101 | const int heads,
102 | const int nbhd_size,
103 | const int dim,
104 | const size_t d_key_numel) {
105 |
106 | const int z = blockIdx.z * blockDim.z + threadIdx.z;
107 | if (z < batch_size * heads){
108 | const int i = blockIdx.y * blockDim.y + threadIdx.y;
109 | if (i < length){
110 | const int c = blockIdx.x * blockDim.x + threadIdx.x;
111 | if (c < dim){
112 | const int b = z / heads;
113 | const int h = z - b * heads;
114 | size_t index;
115 | scalar_t dq_update = scalar_t(0);
116 | scalar_t d_attn_tmp;
117 | #pragma unroll
118 | for (unsigned int ni=0; ni < nbhd_size; ++ni) {
119 | const int64_t nbi = nbhd_idx[b][i][ni];
120 | // calculate d_query = key * d_att
121 | // calculate d_key = query * d_att
122 | d_attn_tmp = d_attn[b][h][i][ni];
123 | dq_update += key[b][h][nbi][c] * d_attn_tmp;
124 | index = b*d_key.stride(0) + h*d_key.stride(1) + nbi*d_key.stride(2) + c;
125 | at::native::fastAtomicAdd(d_key.data(), index, d_key_numel, query[b][h][i][c] * d_attn_tmp, true);
126 | //atomicAdd(&(d_key[b][h][nbi][c]), query[b][h][i][c] * d_attn_tmp); // avoid race condition
127 | }
128 | d_query[b][h][i][c] = dq_update;
129 | }
130 | }
131 | }
132 | }
133 |
134 | std::vector<torch::Tensor> clusten_qk_cuda_backward(
135 | const torch::Tensor &d_attn,
136 | const torch::Tensor &query,
137 | const torch::Tensor &key,
138 | const torch::Tensor &nbhd_idx) {
139 |
140 | int64_t batch_size = query.size(0);
141 | int64_t heads = query.size(1);
142 | int64_t length = query.size(2);
143 | int64_t dim = query.size(3);
144 | int64_t nbhd_size = nbhd_idx.size(2);
145 | int zsize = batch_size * heads;
146 |
147 | int CHANNELTHREADS = min(int64_t(CUDA_NUM_THREADS), dim);
148 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / CHANNELTHREADS), length);
149 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * CHANNELTHREADS));
150 |
151 | auto d_query = torch::zeros_like(query);
152 | auto d_key = torch::zeros_like(key);
153 |
154 | const auto stream = c10::cuda::getCurrentCUDAStream();
155 |
156 | const dim3 blocks(
157 | (dim + CHANNELTHREADS - 1) / CHANNELTHREADS,
158 | (length + TOKENTHREADS - 1) / TOKENTHREADS,
159 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS);
160 |
161 | const dim3 threads(CHANNELTHREADS, TOKENTHREADS, BATCHTHREADS);
162 |
163 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_backward", ([&] {
164 |         const auto d_attn_a = d_attn.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
165 |         const auto query_a = query.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
166 |         const auto key_a = key.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
167 |         const auto nbhd_idx_a = nbhd_idx.packed_accessor32<int64_t, 3, torch::DefaultPtrTraits>();
168 |         auto d_query_a = d_query.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
169 |         auto d_key_a = d_key.packed_accessor32<scalar_t, 4, torch::DefaultPtrTraits>();
170 |
171 | const size_t d_key_numel = d_key.numel();
172 |         clusten_qk_cuda_backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
173 | d_attn_a, query_a, key_a, nbhd_idx_a, d_query_a, d_key_a,
174 | length, batch_size, heads, nbhd_size, dim, d_key_numel);
175 | }));
176 |
177 | return {d_query, d_key};
178 | }
179 |
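As a cross-check, the forward kernel above computes attn[b,h,i,ni] = sum_c query[b,h,i,c] * key[b,h,c,nbhd_idx[b,i,ni]]. A slow PyTorch reference for that gather-and-dot, written by hand as an illustrative sketch (not part of the repository):

import torch

def clusten_qk_reference(query, key_t, nbhd_idx):
    # query: (b, h, n, c); key_t: (b, h, c, n); nbhd_idx: (b, n, m) int64
    b, h, n, c = query.shape
    m = nbhd_idx.shape[-1]
    attn = query.new_zeros(b, h, n, m)
    for bb in range(b):
        for i in range(n):
            k_nb = key_t[bb, :, :, nbhd_idx[bb, i]]                       # (h, c, m)
            attn[bb, :, i, :] = (query[bb, :, i, :, None] * k_nb).sum(1)  # (h, m)
    return attn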
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/clustenwf_cuda.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 | #include <vector>
8 |
9 | torch::Tensor clusten_wf_cuda_forward(
10 | const torch::Tensor &weights, // b x n_ x m x ic
11 | const torch::Tensor &feat, // b x n x c
12 | const torch::Tensor &nbhd_idx); // b x n_ x m
13 |
14 | std::vector<torch::Tensor> clusten_wf_cuda_backward(
15 | const torch::Tensor &d_feat_new,
16 | const torch::Tensor &weights,
17 | const torch::Tensor &feat,
18 | const torch::Tensor &nbhd_idx);
19 |
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 |
25 | torch::Tensor clusten_wf_forward(
26 | const torch::Tensor &weights,
27 | const torch::Tensor &feat,
28 | const torch::Tensor &nbhd_idx) {
29 | CHECK_INPUT(weights);
30 | CHECK_INPUT(feat);
31 | CHECK_INPUT(nbhd_idx);
32 | return clusten_wf_cuda_forward(weights, feat, nbhd_idx);
33 | }
34 |
35 | std::vector<torch::Tensor> clusten_wf_backward(
36 | const torch::Tensor &d_feat_new,
37 | const torch::Tensor &weights,
38 | const torch::Tensor &feat,
39 | const torch::Tensor &nbhd_idx) {
40 | CHECK_INPUT(d_feat_new);
41 | CHECK_INPUT(weights);
42 | CHECK_INPUT(feat);
43 | CHECK_INPUT(nbhd_idx);
44 | return clusten_wf_cuda_backward(d_feat_new, weights, feat, nbhd_idx);
45 | }
46 |
47 |
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 | m.def("forward", &clusten_wf_forward, "CLUSTENWF forward (CUDA)");
50 | m.def("backward", &clusten_wf_backward, "CLUSTENWF backward (CUDA)");
51 | }
52 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/msdetrpc_cuda.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 | #include <vector>
8 |
9 | torch::Tensor msdetrpc_cuda_forward(
10 | const torch::Tensor &nn_idx, // b x n x m x k
11 | const torch::Tensor &nn_weight, // b x n x m x k
12 | const torch::Tensor &attn, // b x n x m
13 | const torch::Tensor &val); // b x n_ x c
14 |
15 | std::vector<torch::Tensor> msdetrpc_cuda_backward(
16 | const torch::Tensor &d_feat,
17 | const torch::Tensor &nn_idx,
18 | const torch::Tensor &nn_weight,
19 | const torch::Tensor &attn,
20 | const torch::Tensor &val);
21 |
22 | // C++ interface
23 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
24 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
25 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
26 |
27 | torch::Tensor msdetrpc_forward(
28 | const torch::Tensor &nn_idx,
29 | const torch::Tensor &nn_weight,
30 | const torch::Tensor &attn,
31 | const torch::Tensor &val) {
32 | CHECK_INPUT(nn_idx);
33 | CHECK_INPUT(nn_weight);
34 | CHECK_INPUT(attn);
35 | CHECK_INPUT(val);
36 | return msdetrpc_cuda_forward(nn_idx, nn_weight, attn, val);
37 | }
38 |
39 | std::vector<torch::Tensor> msdetrpc_backward(
40 | const torch::Tensor &d_feat,
41 | const torch::Tensor &nn_idx,
42 | const torch::Tensor &nn_weight,
43 | const torch::Tensor &attn,
44 | const torch::Tensor &val) {
45 | CHECK_INPUT(d_feat);
46 | CHECK_INPUT(nn_idx);
47 | CHECK_INPUT(nn_weight);
48 | CHECK_INPUT(attn);
49 | CHECK_INPUT(val);
50 | return msdetrpc_cuda_backward(d_feat, nn_idx, nn_weight, attn, val);
51 | }
52 |
53 |
54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
55 | m.def("forward", &msdetrpc_forward, "MSDETRPC forward (CUDA)");
56 | m.def("backward", &msdetrpc_backward, "MSDETRPC backward (CUDA)");
57 | }
58 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/setup.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | from setuptools import setup
7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
8 |
9 | setup(
10 | name='clustencuda',
11 | version='0.1',
12 | author='Ziwen Chen',
13 | author_email='chenziw@oregonstate.edu',
14 | description='Cluster Attention CUDA Kernel',
15 | ext_modules=[
16 | CUDAExtension('clustenqk_cuda', [
17 | 'clustenqk_cuda.cpp',
18 | 'clustenqk_cuda_kernel.cu',
19 | ]),
20 | CUDAExtension('clustenav_cuda', [
21 | 'clustenav_cuda.cpp',
22 | 'clustenav_cuda_kernel.cu',
23 | ]),
24 | CUDAExtension('clustenwf_cuda', [
25 | 'clustenwf_cuda.cpp',
26 | 'clustenwf_cuda_kernel.cu',
27 | ]),
28 | CUDAExtension('weighted_gather_cuda', [
29 | 'weighted_gather_cuda.cpp',
30 | 'weighted_gather_cuda_kernel.cu',
31 | ]),
32 | CUDAExtension('msdetrpc_cuda', [
33 | 'msdetrpc_cuda.cpp',
34 | 'msdetrpc_cuda_kernel.cu',
35 | ]),
36 | ],
37 | cmdclass={
38 | 'build_ext': BuildExtension
39 | })
40 |
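After building the extensions (for example with `python setup.py install` run from this directory, using a CUDA toolchain that matches the installed PyTorch), each kernel is importable as a plain module whose `forward`/`backward` entry points follow the pybind definitions in the corresponding `.cpp` files. A small smoke test with arbitrary illustrative sizes:

```python
import torch
import clustenwf_cuda  # module name from CUDAExtension('clustenwf_cuda', ...)

b, n, n_, m, ic, c = 2, 16, 8, 4, 3, 32
weights = torch.rand(b, n_, m, ic, device='cuda')
feat = torch.rand(b, n, c, device='cuda')
nbhd_idx = torch.randint(n, (b, n_, m), device='cuda')

# all arguments must be contiguous CUDA tensors (see the CHECK_INPUT macros)
feat_new = clustenwf_cuda.forward(weights, feat, nbhd_idx)
print(feat_new.shape)
```

In normal use these modules are not called directly; they are wrapped by the autograd functions in `clusten.py`, as the test scripts below demonstrate.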
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/src/weighted_gather_cuda.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * For licensing see accompanying LICENSE file.
3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */
5 |
6 | #include <torch/extension.h>
7 | #include <vector>
8 |
9 | torch::Tensor weighted_gather_cuda_forward(
10 | const torch::Tensor &nbhd_idx, // b x n x m
11 | const torch::Tensor &weights, // b x n x m
12 | const torch::Tensor &feat); // b x n_ x c
13 |
14 | std::vector<torch::Tensor> weighted_gather_cuda_backward(
15 | const torch::Tensor &d_feat_new,
16 | const torch::Tensor &nbhd_idx,
17 | const torch::Tensor &weights,
18 | const torch::Tensor &feat);
19 |
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 |
25 | torch::Tensor weighted_gather_forward(
26 | const torch::Tensor &nbhd_idx,
27 | const torch::Tensor &weights,
28 | const torch::Tensor &feat) {
29 | CHECK_INPUT(nbhd_idx);
30 | CHECK_INPUT(weights);
31 | CHECK_INPUT(feat);
32 | return weighted_gather_cuda_forward(nbhd_idx, weights, feat);
33 | }
34 |
35 | std::vector<torch::Tensor> weighted_gather_backward(
36 | const torch::Tensor &d_feat_new,
37 | const torch::Tensor &nbhd_idx,
38 | const torch::Tensor &weights,
39 | const torch::Tensor &feat) {
40 | CHECK_INPUT(d_feat_new);
41 | CHECK_INPUT(nbhd_idx);
42 | CHECK_INPUT(weights);
43 | CHECK_INPUT(feat);
44 | return weighted_gather_cuda_backward(d_feat_new, nbhd_idx, weights, feat);
45 | }
46 |
47 |
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 | m.def("forward", &weighted_gather_forward, "WEIGHTEDGATHER forward (CUDA)");
50 | m.def("backward", &weighted_gather_backward, "WEIGHTEDGATHER backward (CUDA)");
51 | }
52 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/test_msdetrpc_kernel.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | import torch
7 | from clusten import MSDETRPCFunction
8 |
9 | """
10 | Test the correctness of MSDETR (point cloud) custom kernel
11 | """
12 |
13 | b = 100
14 | n = 50
15 | n_ = 100
16 | m = 8
17 | k = 4
18 | c = 32
19 |
20 | # dummy data
21 | nn_idx = torch.randint(n_, (b, n, m, k)).cuda()
22 | nn_weights = torch.rand(b, n, m, k).cuda()
23 | attn = torch.rand(b, n, m).cuda()
24 | val = torch.rand(b, n_, c).cuda()
25 |
26 | nn_weights.requires_grad_(True)
27 | nn_weights.retain_grad()
28 | attn.requires_grad_(True)
29 | attn.retain_grad()
30 | val.requires_grad_(True)
31 | val.retain_grad()
32 |
33 | # use the custom kernel
34 | feat = MSDETRPCFunction.apply(nn_idx, nn_weights, attn, val)
35 | feat.mean().backward()
36 | grad_weights = nn_weights.grad.clone().detach()
37 | grad_attn = attn.grad.clone().detach()
38 | grad_val = val.grad.clone().detach()
39 | nn_weights.grad.data.zero_()
40 | attn.grad.data.zero_()
41 | val.grad.data.zero_()
42 |
43 | # use the pytorch equivalent
44 | nn_val = val.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, m, k, c)
45 | feat2 = ((nn_val * nn_weights.unsqueeze(4)).sum(3) * attn.unsqueeze(3)).sum(2) # b x n x c
46 | feat2.mean().backward()
47 | grad_weights2 = nn_weights.grad.clone().detach()
48 | grad_attn2 = attn.grad.clone().detach()
49 | grad_val2 = val.grad.clone().detach()
50 | nn_weights.grad.data.zero_()
51 | attn.grad.data.zero_()
52 | val.grad.data.zero_()
53 |
54 | print('diff of forward: ', torch.linalg.norm(feat2 - feat))
55 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights))
56 | print('diff of grad attn: ', torch.linalg.norm(grad_attn2 - grad_attn))
57 | print('diff of grad val: ', torch.linalg.norm(grad_val2 - grad_val))
58 |
--------------------------------------------------------------------------------
/mask2former/modeling/clusten/test_wg_kernel.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE file.
3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | #
5 |
6 | import torch
7 | from clusten import WEIGHTEDGATHERFunction
8 |
9 | """
10 | Test the correctness of WeightedGather custom kernel
11 | """
12 |
13 | b = 100
14 | n = 50
15 | n_ = 100
16 | k = 4
17 | c = 32
18 |
19 | # dummy data
20 | nn_idx = torch.randint(n_, (b, n, k)).cuda()
21 | nn_weights = torch.rand(b, n, k).cuda()
22 | feature = torch.rand(b, n_, c).cuda()
23 | nn_weights.requires_grad_(True)
24 | nn_weights.retain_grad()
25 | feature.requires_grad_(True)
26 | feature.retain_grad()
27 |
28 | # use the custom kernel
29 | up_features = WEIGHTEDGATHERFunction.apply(nn_idx, nn_weights, feature)
30 | up_features.mean().backward()
31 | grad_weights = nn_weights.grad.clone().detach()
32 | grad_feat = feature.grad.clone().detach()
33 | nn_weights.grad.data.zero_()
34 | feature.grad.data.zero_()
35 |
36 | # use the pytorch equivalent
37 | nn_features = feature.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, k, c)
38 | up_features2 = nn_features.mul(nn_weights.unsqueeze(3).expand(-1, -1, -1, c)).sum(dim=2) # b x n x c
39 | up_features2.mean().backward()
40 | grad_weights2 = nn_weights.grad.clone().detach()
41 | grad_feat2 = feature.grad.clone().detach()
42 | nn_weights.grad.data.zero_()
43 | feature.grad.data.zero_()
44 |
45 | print('diff of forward: ', torch.linalg.norm(up_features2 - up_features))
46 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights))
47 | print('diff of grad feat: ', torch.linalg.norm(grad_feat2 - grad_feat))
48 |
--------------------------------------------------------------------------------
/mask2former/modeling/matcher.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
3 |
4 | """
5 | Modules to compute the matching cost and solve the corresponding LSAP.
6 | """
7 | import torch
8 | import torch.nn.functional as F
9 | from scipy.optimize import linear_sum_assignment
10 | from torch import nn
11 | from torch.cuda.amp import autocast
12 |
13 | from detectron2.projects.point_rend.point_features import point_sample
14 |
15 |
16 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
17 | """
18 | Compute the DICE loss, similar to generalized IOU for masks
19 | Args:
20 | inputs: A float tensor of arbitrary shape.
21 | The predictions for each example.
22 | targets: A float tensor with the same shape as inputs. Stores the binary
23 | classification label for each element in inputs
24 | (0 for the negative class and 1 for the positive class).
25 | """
26 | inputs = inputs.sigmoid()
27 | inputs = inputs.flatten(1)
28 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
29 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
30 | loss = 1 - (numerator + 1) / (denominator + 1)
31 | return loss
32 |
33 |
34 | batch_dice_loss_jit = torch.jit.script(
35 | batch_dice_loss
36 | ) # type: torch.jit.ScriptModule
37 |
38 |
39 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
40 | """
41 | Args:
42 | inputs: A float tensor of arbitrary shape.
43 | The predictions for each example.
44 | targets: A float tensor with the same shape as inputs. Stores the binary
45 | classification label for each element in inputs
46 | (0 for the negative class and 1 for the positive class).
47 | Returns:
48 | Loss tensor
49 | """
50 | hw = inputs.shape[1]
51 |
52 | pos = F.binary_cross_entropy_with_logits(
53 | inputs, torch.ones_like(inputs), reduction="none"
54 | )
55 | neg = F.binary_cross_entropy_with_logits(
56 | inputs, torch.zeros_like(inputs), reduction="none"
57 | )
58 |
59 | loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
60 | "nc,mc->nm", neg, (1 - targets)
61 | )
62 |
63 | return loss / hw
64 |
65 |
66 | batch_sigmoid_ce_loss_jit = torch.jit.script(
67 | batch_sigmoid_ce_loss
68 | ) # type: torch.jit.ScriptModule
69 |
70 |
71 | class HungarianMatcher(nn.Module):
72 | """This class computes an assignment between the targets and the predictions of the network
73 |
74 | For efficiency reasons, the targets don't include the no_object. Because of this, in general,
75 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
76 | while the others are un-matched (and thus treated as non-objects).
77 | """
78 |
79 | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
80 | """Creates the matcher
81 |
82 | Params:
83 | cost_class: This is the relative weight of the classification error in the matching cost
84 | cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
85 | cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
86 | """
87 | super().__init__()
88 | self.cost_class = cost_class
89 | self.cost_mask = cost_mask
90 | self.cost_dice = cost_dice
91 |
92 |         assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs can't be 0"
93 |
94 | self.num_points = num_points
95 |
96 | @torch.no_grad()
97 | def memory_efficient_forward(self, outputs, targets):
98 | """More memory-friendly matching"""
99 | bs, num_queries = outputs["pred_logits"].shape[:2]
100 |
101 | indices = []
102 |
103 | # Iterate through batch size
104 | for b in range(bs):
105 |
106 | out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes]
107 | tgt_ids = targets[b]["labels"]
108 |
109 | # Compute the classification cost. Contrary to the loss, we don't use the NLL,
110 | # but approximate it in 1 - proba[target class].
111 |             # The 1 is a constant that doesn't change the matching, it can be omitted.
112 | cost_class = -out_prob[:, tgt_ids]
113 |
114 | out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred]
115 | # gt masks are already padded when preparing target
116 | tgt_mask = targets[b]["masks"].to(out_mask)
117 |
118 | out_mask = out_mask[:, None]
119 | tgt_mask = tgt_mask[:, None]
120 | # all masks share the same set of points for efficient matching!
121 | point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device)
122 | # get gt labels
123 | tgt_mask = point_sample(
124 | tgt_mask,
125 | point_coords.repeat(tgt_mask.shape[0], 1, 1),
126 | align_corners=False,
127 | ).squeeze(1)
128 |
129 | out_mask = point_sample(
130 | out_mask,
131 | point_coords.repeat(out_mask.shape[0], 1, 1),
132 | align_corners=False,
133 | ).squeeze(1)
134 |
135 | with autocast(enabled=False):
136 | out_mask = out_mask.float()
137 | tgt_mask = tgt_mask.float()
138 |                 # Compute the sigmoid cross-entropy loss between masks
139 | # cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask)
140 | cost_mask = batch_sigmoid_ce_loss(out_mask, tgt_mask)
141 |
142 |                 # Compute the dice loss between masks
143 | # cost_dice = batch_dice_loss_jit(out_mask, tgt_mask)
144 | cost_dice = batch_dice_loss(out_mask, tgt_mask)
145 |
146 | # Final cost matrix
147 | C = (
148 | self.cost_mask * cost_mask
149 | + self.cost_class * cost_class
150 | + self.cost_dice * cost_dice
151 | )
152 | C = C.reshape(num_queries, -1).cpu()
153 |
154 | indices.append(linear_sum_assignment(C))
155 |
156 | return [
157 | (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
158 | for i, j in indices
159 | ]
160 |
161 | @torch.no_grad()
162 | def forward(self, outputs, targets):
163 | """Performs the matching
164 |
165 | Params:
166 | outputs: This is a dict that contains at least these entries:
167 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
168 | "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
169 |
170 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
171 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
172 | objects in the target) containing the class labels
173 | "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
174 |
175 | Returns:
176 | A list of size batch_size, containing tuples of (index_i, index_j) where:
177 | - index_i is the indices of the selected predictions (in order)
178 | - index_j is the indices of the corresponding selected targets (in order)
179 | For each batch element, it holds:
180 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
181 | """
182 | return self.memory_efficient_forward(outputs, targets)
183 |
184 | def __repr__(self, _repr_indent=4):
185 | head = "Matcher " + self.__class__.__name__
186 | body = [
187 | "cost_class: {}".format(self.cost_class),
188 | "cost_mask: {}".format(self.cost_mask),
189 | "cost_dice: {}".format(self.cost_dice),
190 | ]
191 | lines = [head] + [" " * _repr_indent + line for line in body]
192 | return "\n".join(lines)
193 |
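A minimal usage sketch with random predictions and targets; the sizes, cost weights, and `num_points` value below are illustrative assumptions, and it presumes the repository root is on `PYTHONPATH` with detectron2 and scipy installed:

```python
import torch
from mask2former.modeling.matcher import HungarianMatcher

matcher = HungarianMatcher(cost_class=2.0, cost_mask=5.0, cost_dice=5.0, num_points=112 * 112)

bs, num_queries, num_classes, H, W = 2, 100, 19, 64, 128
outputs = {
    "pred_logits": torch.randn(bs, num_queries, num_classes + 1),
    "pred_masks": torch.randn(bs, num_queries, H, W),
}
targets = [
    {"labels": torch.randint(num_classes, (5,)),
     "masks": torch.randint(0, 2, (5, H, W)).float()}
    for _ in range(bs)
]

indices = matcher(outputs, targets)
# indices[b] is a (pred_idx, tgt_idx) pair of matched query/target indices for image b
```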
--------------------------------------------------------------------------------
/mask2former/modeling/meta_arch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask2former/modeling/meta_arch/mask_former_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import logging
4 | from typing import Dict
5 |
6 | from torch import nn
7 |
8 | from detectron2.config import configurable
9 | from detectron2.layers import ShapeSpec
10 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
11 |
12 | from ..transformer_decoder.mask2former_transformer_decoder import build_transformer_decoder
13 | from ..pixel_decoder.msdeformattn_pc import build_pixel_decoder
14 |
15 |
16 | @SEM_SEG_HEADS_REGISTRY.register()
17 | class MaskFormerHead(nn.Module):
18 |
19 | _version = 2
20 |
21 | def _load_from_state_dict(
22 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
23 | ):
24 | version = local_metadata.get("version", None)
25 | if version is None or version < 2:
26 | # Do not warn if train from scratch
27 | scratch = True
28 | logger = logging.getLogger(__name__)
29 | for k in list(state_dict.keys()):
30 | newk = k
31 | if newk != k:
32 | state_dict[newk] = state_dict[k]
33 | del state_dict[k]
34 | scratch = False
35 |
36 | if not scratch:
37 | logger.warning(
38 |                     f"Weight format of {self.__class__.__name__} has changed! "
39 | "Please upgrade your models. Applying automatic conversion now ..."
40 | )
41 |
42 |
43 | @configurable
44 | def __init__(
45 | self,
46 | input_shape: Dict[str, ShapeSpec],
47 | *,
48 | num_classes: int,
49 | pixel_decoder: nn.Module,
50 | loss_weight: float = 1.0,
51 | ignore_value: int = -1,
52 | # extra parameters
53 | transformer_predictor: nn.Module,
54 | transformer_in_feature: str,
55 | ):
56 | """
57 | NOTE: this interface is experimental.
58 | Args:
59 | input_shape: shapes (channels and stride) of the input features
60 | num_classes: number of classes to predict
61 | pixel_decoder: the pixel decoder module
62 | loss_weight: loss weight
63 | ignore_value: category id to be ignored during training.
64 |             transformer_predictor: the transformer decoder that makes the predictions
65 | transformer_in_feature: input feature name to the transformer_predictor
66 | """
67 | super().__init__()
68 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
69 | self.in_features = [k for k, v in input_shape]
70 | feature_strides = [v.stride for k, v in input_shape]
71 | feature_channels = [v.channels for k, v in input_shape]
72 |
73 | self.ignore_value = ignore_value
74 | self.common_stride = 4
75 | self.loss_weight = loss_weight
76 |
77 | self.pixel_decoder = pixel_decoder
78 | self.predictor = transformer_predictor
79 | self.transformer_in_feature = transformer_in_feature
80 |
81 | self.num_classes = num_classes
82 |
83 | @classmethod
84 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
85 | # figure out in_channels to transformer predictor
86 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder":
87 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
88 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding":
89 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
90 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2
91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
92 | else:
93 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels
94 |
95 | return {
96 | "input_shape": {
97 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
98 | },
99 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
100 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
101 | "pixel_decoder": build_pixel_decoder(cfg, input_shape),
102 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
103 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE,
104 | "transformer_predictor": build_transformer_decoder(
105 | cfg,
106 | transformer_predictor_in_channels,
107 | mask_classification=True,
108 | ),
109 | }
110 |
111 | def forward(self, features, mask=None):
112 | return self.layers(features, mask)
113 |
114 | def layers(self, features, mask=None):
115 | mask_features, mf_pos, transformer_encoder_features, multi_scale_features, multi_scale_poss = self.pixel_decoder.forward_features(features)
116 | if self.transformer_in_feature == "multi_scale_pixel_decoder":
117 | predictions = self.predictor(multi_scale_features, multi_scale_poss, mask_features, mf_pos, mask)
118 | else:
119 | if self.transformer_in_feature == "transformer_encoder":
120 | assert (
121 | transformer_encoder_features is not None
122 | ), "Please use the TransformerEncoderPixelDecoder."
123 | predictions = self.predictor(transformer_encoder_features, mask_features, mask)
124 | elif self.transformer_in_feature == "pixel_embedding":
125 | predictions = self.predictor(mask_features, mask_features, mask)
126 | else:
127 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask)
128 | return predictions
129 |
--------------------------------------------------------------------------------
/mask2former/modeling/transformer_decoder/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder
4 |
--------------------------------------------------------------------------------
/mask2former/modeling/transformer_decoder/position_encoding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
3 | # Adapted for AutoFocusFormer by Ziwen 2023
4 |
5 | """
6 | Various positional encodings for the transformer.
7 | """
8 | import math
9 |
10 | import torch
11 | from torch import nn
12 |
13 |
14 | class PositionEmbeddingSine(nn.Module):
15 | """
16 | This is a more standard version of the position embedding, very similar to the one
17 |     used by the Attention Is All You Need paper, generalized here to work on point coordinates.
18 | """
19 |
20 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
21 | super().__init__()
22 | self.num_pos_feats = num_pos_feats
23 | self.temperature = temperature
24 | self.normalize = normalize
25 | if scale is not None and normalize is False:
26 | raise ValueError("normalize should be True if scale is passed")
27 | if scale is None:
28 | scale = 2 * math.pi
29 | self.scale = scale
30 |
31 | def forward(self, pos):
32 | '''
33 | pos - b x n x d
34 | '''
35 | b, n, d = pos.shape
36 | y_embed = pos[:, :, 1] # b x n
37 | x_embed = pos[:, :, 0]
38 | if self.normalize:
39 | eps = 1e-6
40 | y_embed = y_embed / (y_embed.max() + eps) * self.scale
41 | x_embed = x_embed / (x_embed.max() + eps) * self.scale
42 |
43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=pos.device) # npf
44 | dim_t = self.temperature ** (2 * (dim_t.div(2, rounding_mode='floor')) / self.num_pos_feats) # npf
45 |
46 | pos_x = x_embed[:, :, None] / dim_t # b x n x npf
47 | pos_y = y_embed[:, :, None] / dim_t
48 | pos_x = torch.cat(
49 | (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=2
50 | )
51 | pos_y = torch.cat(
52 | (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=2
53 | )
54 | pos = torch.cat((pos_x, pos_y), dim=2) # b x n x d'
55 | return pos
56 |
57 | def __repr__(self, _repr_indent=4):
58 | head = "Positional encoding " + self.__class__.__name__
59 | body = [
60 | "num_pos_feats: {}".format(self.num_pos_feats),
61 | "temperature: {}".format(self.temperature),
62 | "normalize: {}".format(self.normalize),
63 | "scale: {}".format(self.scale),
64 | ]
65 | # _repr_indent = 4
66 | lines = [head] + [" " * _repr_indent + line for line in body]
67 | return "\n".join(lines)
68 |
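A small usage sketch on a dummy point set; with `num_pos_feats=64` per axis the returned embedding has 128 channels (the coordinate range and sizes are arbitrary, and the import assumes the repository root is on `PYTHONPATH`):

```python
import torch
from mask2former.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine

pos_enc = PositionEmbeddingSine(num_pos_feats=64, normalize=True)

b, n = 2, 500
pos = torch.rand(b, n, 2) * 100.0   # (x, y) location of each token
emb = pos_enc(pos)                  # b x n x 128 sinusoidal embedding
print(emb.shape)
```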
--------------------------------------------------------------------------------
/mask2former/test_time_augmentation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import copy
4 |
5 | import numpy as np
6 | import torch
7 | from fvcore.transforms import HFlipTransform
8 | from torch import nn
9 | from torch.nn.parallel import DistributedDataParallel
10 |
11 | from detectron2.data.detection_utils import read_image
12 | from detectron2.modeling import DatasetMapperTTA
13 |
14 |
15 | __all__ = [
16 | "SemanticSegmentorWithTTA",
17 | ]
18 |
19 |
20 | class SemanticSegmentorWithTTA(nn.Module):
21 | """
22 | A SemanticSegmentor with test-time augmentation enabled.
23 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
24 | """
25 |
26 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
27 | """
28 | Args:
29 | cfg (CfgNode):
30 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
31 | tta_mapper (callable): takes a dataset dict and returns a list of
32 | augmented versions of the dataset dict. Defaults to
33 | `DatasetMapperTTA(cfg)`.
34 | batch_size (int): batch the augmented images into this batch size for inference.
35 | """
36 | super().__init__()
37 | if isinstance(model, DistributedDataParallel):
38 | model = model.module
39 | self.cfg = cfg.clone()
40 |
41 | self.model = model
42 |
43 | if tta_mapper is None:
44 | tta_mapper = DatasetMapperTTA(cfg)
45 | self.tta_mapper = tta_mapper
46 | self.batch_size = batch_size
47 |
48 | def __call__(self, batched_inputs):
49 | """
50 | Same input/output format as :meth:`SemanticSegmentor.forward`
51 | """
52 |
53 | def _maybe_read_image(dataset_dict):
54 | ret = copy.copy(dataset_dict)
55 | if "image" not in ret:
56 | image = read_image(ret.pop("file_name"), self.model.input_format)
57 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW
58 | ret["image"] = image
59 | if "height" not in ret and "width" not in ret:
60 | ret["height"] = image.shape[1]
61 | ret["width"] = image.shape[2]
62 | return ret
63 |
64 | processed_results = []
65 | for x in batched_inputs:
66 | result = self._inference_one_image(_maybe_read_image(x))
67 | processed_results.append(result)
68 | return processed_results
69 |
70 | def _inference_one_image(self, input):
71 | """
72 | Args:
73 | input (dict): one dataset dict with "image" field being a CHW tensor
74 | Returns:
75 | dict: one output dict
76 | """
77 | orig_shape = (input["height"], input["width"])
78 | augmented_inputs, tfms = self._get_augmented_inputs(input)
79 |
80 | final_predictions = None
81 | count_predictions = 0
82 | for input, tfm in zip(augmented_inputs, tfms):
83 | count_predictions += 1
84 | with torch.no_grad():
85 | if final_predictions is None:
86 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
87 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2])
88 | else:
89 | final_predictions = self.model([input])[0].pop("sem_seg")
90 | else:
91 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
92 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2])
93 | else:
94 | final_predictions += self.model([input])[0].pop("sem_seg")
95 |
96 | final_predictions = final_predictions / count_predictions
97 | return {"sem_seg": final_predictions}
98 |
99 | def _get_augmented_inputs(self, input):
100 | augmented_inputs = self.tta_mapper(input)
101 | tfms = [x.pop("transforms") for x in augmented_inputs]
102 | return augmented_inputs, tfms
103 |
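A sketch of how the wrapper might be used around a model built from one of the configs in this repository; the config path, the random dummy image, and the setup are illustrative assumptions (the custom CUDA kernels must be built and a GPU available, and in practice the weights would be loaded with `DetectionCheckpointer`):

```python
import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_model
from detectron2.projects.deeplab import add_deeplab_config

from mask2former import add_maskformer2_config
from mask2former.test_time_augmentation import SemanticSegmentorWithTTA

cfg = get_cfg()
add_deeplab_config(cfg)
add_maskformer2_config(cfg)
cfg.merge_from_file("configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_bs32_80k.yaml")
cfg.freeze()

model = build_model(cfg).eval()
tta_model = SemanticSegmentorWithTTA(cfg, model)

image = torch.randint(0, 256, (3, 512, 512), dtype=torch.uint8)  # CHW dummy image
result = tta_model([{"image": image, "height": 512, "width": 512}])[0]
print(result["sem_seg"].shape)  # num_classes x 512 x 512
```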
--------------------------------------------------------------------------------
/mask2former/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
--------------------------------------------------------------------------------
/mask2former/utils/misc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | """
4 | Misc functions, including distributed helpers.
5 |
6 | Mostly copy-paste from torchvision references.
7 | """
8 | from typing import List, Optional
9 |
10 | import torch
11 | import torch.distributed as dist
12 | import torchvision
13 | from torch import Tensor
14 |
15 |
16 | def _max_by_axis(the_list):
17 | # type: (List[List[int]]) -> List[int]
18 | maxes = the_list[0]
19 | for sublist in the_list[1:]:
20 | for index, item in enumerate(sublist):
21 | maxes[index] = max(maxes[index], item)
22 | return maxes
23 |
24 |
25 | class NestedTensor(object):
26 | def __init__(self, tensors, mask: Optional[Tensor]):
27 | self.tensors = tensors
28 | self.mask = mask
29 |
30 | def to(self, device):
31 | # type: (Device) -> NestedTensor # noqa
32 | cast_tensor = self.tensors.to(device)
33 | mask = self.mask
34 | if mask is not None:
35 | assert mask is not None
36 | cast_mask = mask.to(device)
37 | else:
38 | cast_mask = None
39 | return NestedTensor(cast_tensor, cast_mask)
40 |
41 | def decompose(self):
42 | return self.tensors, self.mask
43 |
44 | def __repr__(self):
45 | return str(self.tensors)
46 |
47 |
48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
49 | # TODO make this more general
50 | if tensor_list[0].ndim == 3:
51 | if torchvision._is_tracing():
52 | # nested_tensor_from_tensor_list() does not export well to ONNX
53 | # call _onnx_nested_tensor_from_tensor_list() instead
54 | return _onnx_nested_tensor_from_tensor_list(tensor_list)
55 |
56 | # TODO make it support different-sized images
57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
59 | batch_shape = [len(tensor_list)] + max_size
60 | b, c, h, w = batch_shape
61 | dtype = tensor_list[0].dtype
62 | device = tensor_list[0].device
63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
65 | for img, pad_img, m in zip(tensor_list, tensor, mask):
66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
67 | m[: img.shape[1], : img.shape[2]] = False
68 | else:
69 | raise ValueError("not supported")
70 | return NestedTensor(tensor, mask)
71 |
72 |
73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
75 | @torch.jit.unused
76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
77 | max_size = []
78 | for i in range(tensor_list[0].dim()):
79 | max_size_i = torch.max(
80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
81 | ).to(torch.int64)
82 | max_size.append(max_size_i)
83 | max_size = tuple(max_size)
84 |
85 | # work around for
86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
87 | # m[: img.shape[1], :img.shape[2]] = False
88 | # which is not yet supported in onnx
89 | padded_imgs = []
90 | padded_masks = []
91 | for img in tensor_list:
92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
94 | padded_imgs.append(padded_img)
95 |
96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
98 | padded_masks.append(padded_mask.to(torch.bool))
99 |
100 | tensor = torch.stack(padded_imgs)
101 | mask = torch.stack(padded_masks)
102 |
103 | return NestedTensor(tensor, mask=mask)
104 |
105 |
106 | def is_dist_avail_and_initialized():
107 | if not dist.is_available():
108 | return False
109 | if not dist.is_initialized():
110 | return False
111 | return True
112 |
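For example, `nested_tensor_from_tensor_list` pads a list of differently sized images into one batch plus a padding mask (a small illustrative snippet; it assumes the repository root is on `PYTHONPATH`):

```python
import torch
from mask2former.utils.misc import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 32, 64), torch.rand(3, 40, 48)]
nt = nested_tensor_from_tensor_list(imgs)
tensors, mask = nt.decompose()

print(tensors.shape)  # torch.Size([2, 3, 40, 64]) -- padded to the per-axis maximum
print(mask.shape)     # torch.Size([2, 40, 64])    -- True marks padded pixels
```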
--------------------------------------------------------------------------------
/run_aff_segmentation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # number of parallel gpus
4 | GPUS=2
5 |
6 | # path to config file
7 | CONFIG=configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml
8 |
9 | # checkpoint path for resume
10 | RESUME=checkpoints/city_pan/aff_small.pth
11 |
12 | # output folder
13 | OUTPUT=outputs/
14 |
15 | python train_net.py --num-gpus $GPUS \
16 | --config-file $CONFIG \
17 | --dist-url tcp://127.0.0.1:12345 \
18 | --resume \
19 | --eval-only \
20 | MODEL.WEIGHTS $RESUME \
21 | OUTPUT_DIR $OUTPUT
22 |
23 | # Remove '--resume', '--eval-only' and 'MODEL.WEIGHTS' to train from scratch.
24 | # Note that when '--resume' is set, 'MODEL.WEIGHTS' is overridden by the last_checkpoint file in the output folder (auto-resume), if that file exists.
25 | # The KEY VALUE pairs must come last, after all the flags.
26 |
--------------------------------------------------------------------------------
/run_demo.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # path to config file
4 | CONFIG="../configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml"
5 |
6 | # path to pre-trained checkpoint
7 | CKPT="../checkpoints/city_pan/aff_small.pth"
8 |
9 | # path to images for prediction
10 | INPUTS="../imgs/*.jpg"
11 |
12 | # path to blurred version of input images (optional)
13 | BLUR="../imgs_blur/"
14 |
15 | # output folder to store results
16 | OUTPUT="demo_res"
17 |
18 | # create output folder
19 | mkdir $OUTPUT
20 |
21 | # run visualization code
22 | cd demo/
23 | python demo.py --config-file $CONFIG \
24 | --input $INPUTS \
25 | --output ../$OUTPUT \
26 | --blur $BLUR \
27 | --opts MODEL.WEIGHTS $CKPT \
28 |
29 | # The --opts flag should always be the last one
30 | # Remove --blur flag to visualize predictions on original images
31 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a few tools.
2 |
3 | * `convert-pretrained-model-to-d2.py`
4 |
5 | Tool to convert ImageNet pre-trained weights for D2.
6 |
7 | * `analyze_model.py`
8 |
9 | Tool to analyze model parameters and flops.
10 |
11 | Usage for semantic segmentation (ADE20K only, use with caution!):
12 |
13 | ```
14 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
15 | ```
16 |
17 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with a fixed size equal to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
18 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like COCO!
19 |
20 | Usage for panoptic and instance segmentation:
21 |
22 | ```
23 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
24 | ```
25 |
26 | Note that, for panoptic and instance segmentation, we compute the average FLOPs over 100 real validation images.
27 |
--------------------------------------------------------------------------------
/tools/analyze_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py
4 |
5 | import logging
6 | import numpy as np
7 | from collections import Counter
8 | import tqdm
9 | from fvcore.nn import flop_count_table # can also try flop_count_str
10 |
11 | from detectron2.checkpoint import DetectionCheckpointer
12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
13 | from detectron2.data import build_detection_test_loader
14 | from detectron2.engine import default_argument_parser
15 | from detectron2.modeling import build_model
16 | from detectron2.projects.deeplab import add_deeplab_config
17 | from detectron2.utils.analysis import (
18 | FlopCountAnalysis,
19 | activation_count_operators,
20 | parameter_count_table,
21 | )
22 | from detectron2.utils.logger import setup_logger
23 |
24 | # fmt: off
25 | import os
26 | import sys
27 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
28 | # fmt: on
29 |
30 | from mask2former import add_maskformer2_config
31 |
32 | logger = logging.getLogger("detectron2")
33 |
34 | """
35 | Analyzes FLOP count, parameter count, model structure and operator activation count for models
36 | For usage example, please refer to tools/README.md
37 | """
38 |
39 |
40 | def setup(args):
41 | if args.config_file.endswith(".yaml"):
42 | cfg = get_cfg()
43 | add_deeplab_config(cfg)
44 | add_maskformer2_config(cfg)
45 | cfg.merge_from_file(args.config_file)
46 | cfg.DATALOADER.NUM_WORKERS = 0
47 | cfg.merge_from_list(args.opts)
48 | cfg.freeze()
49 | else:
50 | cfg = LazyConfig.load(args.config_file)
51 | cfg = LazyConfig.apply_overrides(cfg, args.opts)
52 | setup_logger(name="fvcore")
53 | setup_logger()
54 | return cfg
55 |
56 |
57 | def do_flop(cfg):
58 | if isinstance(cfg, CfgNode):
59 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
60 | model = build_model(cfg)
61 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
62 | else:
63 | data_loader = instantiate(cfg.dataloader.test)
64 | model = instantiate(cfg.model)
65 | model.to(cfg.train.device)
66 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
67 | model.eval()
68 |
69 | counts = Counter()
70 | total_flops = []
71 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa
72 | if args.use_fixed_input_size and isinstance(cfg, CfgNode):
73 | import torch
74 | crop_size = cfg.INPUT.CROP.SIZE[0]
75 | data[0]["image"] = torch.zeros((3, crop_size, crop_size))
76 | flops = FlopCountAnalysis(model, data)
77 | if idx > 0:
78 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
79 | counts += flops.by_operator()
80 | total_flops.append(flops.total())
81 |
82 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
83 | logger.info(
84 | "Average GFlops for each type of operators:\n"
85 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
86 | )
87 | logger.info(
88 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)
89 | )
90 |
91 |
92 | def do_activation(cfg):
93 | if isinstance(cfg, CfgNode):
94 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
95 | model = build_model(cfg)
96 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
97 | else:
98 | data_loader = instantiate(cfg.dataloader.test)
99 | model = instantiate(cfg.model)
100 | model.to(cfg.train.device)
101 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
102 | model.eval()
103 |
104 | counts = Counter()
105 | total_activations = []
106 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa
107 | count = activation_count_operators(model, data)
108 | counts += count
109 | total_activations.append(sum(count.values()))
110 | logger.info(
111 | "(Million) Activations for Each Type of Operators:\n"
112 |         + str([(k, v / (idx + 1)) for k, v in counts.items()])
113 | )
114 | logger.info(
115 | "Total (Million) Activations: {}±{}".format(
116 | np.mean(total_activations), np.std(total_activations)
117 | )
118 | )
119 |
120 |
121 | def do_parameter(cfg):
122 | if isinstance(cfg, CfgNode):
123 | model = build_model(cfg)
124 | else:
125 | model = instantiate(cfg.model)
126 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
127 |
128 |
129 | def do_structure(cfg):
130 | if isinstance(cfg, CfgNode):
131 | model = build_model(cfg)
132 | else:
133 | model = instantiate(cfg.model)
134 | logger.info("Model Structure:\n" + str(model))
135 |
136 |
137 | if __name__ == "__main__":
138 | parser = default_argument_parser(
139 | epilog="""
140 | Examples:
141 | To show parameters of a model:
142 | $ ./analyze_model.py --tasks parameter \\
143 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
144 | Flops and activations are data-dependent, therefore inputs and model weights
145 | are needed to count them:
146 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\
147 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
148 | MODEL.WEIGHTS /path/to/model.pkl
149 | """
150 | )
151 | parser.add_argument(
152 | "--tasks",
153 | choices=["flop", "activation", "parameter", "structure"],
154 | required=True,
155 | nargs="+",
156 | )
157 | parser.add_argument(
158 | "-n",
159 | "--num-inputs",
160 | default=100,
161 | type=int,
162 | help="number of inputs used to compute statistics for flops/activations, "
163 | "both are data dependent.",
164 | )
165 | parser.add_argument(
166 | "--use-fixed-input-size",
167 | action="store_true",
168 | help="use fixed input size when calculating flops",
169 | )
170 | args = parser.parse_args()
171 | assert not args.eval_only
172 | assert args.num_gpus == 1
173 |
174 | cfg = setup(args)
175 |
176 | for task in args.tasks:
177 | {
178 | "flop": do_flop,
179 | "activation": do_activation,
180 | "parameter": do_parameter,
181 | "structure": do_structure,
182 | }[task](cfg)
183 |
--------------------------------------------------------------------------------
/tools/convert-pretrained-model-to-d2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3 | # Adapted for AutoFocusFormer by Ziwen 2023
4 |
5 | import pickle as pkl
6 | import sys
7 |
8 | import torch
9 |
10 | """
11 | Usage:
12 | # run the conversion
13 | python ./convert-pretrained-model-to-d2.py aff.pth aff.pkl
14 | # Then, use aff.pkl in config:
15 | MODEL:
16 | WEIGHTS: "/path/to/aff.pkl"
17 | INPUT:
18 | FORMAT: "RGB"
19 | """
20 |
21 | if __name__ == "__main__":
22 | input = sys.argv[1]
23 |
24 | obj = torch.load(input, map_location="cpu")["model"]
25 |
26 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
27 |
28 | with open(sys.argv[2], "wb") as f:
29 | pkl.dump(res, f)
30 |
--------------------------------------------------------------------------------