├── .flake8 ├── .gitattributes ├── .gitignore ├── ACKNOWLEDGMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── aff.png ├── architecture.png ├── builtin.py ├── builtin_meta.py ├── configs ├── ade20k │ └── semantic-segmentation │ │ ├── Base-ADE20K-SemanticSegmentation.yaml │ │ ├── aff │ │ ├── maskformer2_aff_mini_1_5th_bs32_80k.yaml │ │ ├── maskformer2_aff_mini_bs32_80k.yaml │ │ ├── maskformer2_aff_small_1_5th_bs32_80k.yaml │ │ ├── maskformer2_aff_small_bs32_80k.yaml │ │ ├── maskformer2_aff_tiny_1_5th_bs32_80k.yaml │ │ └── maskformer2_aff_tiny_bs32_80k.yaml │ │ └── maskformer2_R50_bs16_160k.yaml ├── cityscapes │ ├── instance-segmentation │ │ ├── Base-Cityscapes-InstanceSegmentation.yaml │ │ ├── aff │ │ │ ├── maskformer2_aff_base_384_bs16_90k.yaml │ │ │ ├── maskformer2_aff_mini_bs32_45k.yaml │ │ │ ├── maskformer2_aff_small_bs32_45k.yaml │ │ │ └── maskformer2_aff_tiny_bs32_45k.yaml │ │ └── maskformer2_R50_bs16_90k.yaml │ └── panoptic-segmentation │ │ ├── Base-Cityscapes-PanopticSegmentation.yaml │ │ ├── aff │ │ ├── maskformer2_aff_base_384_bs16_90k.yaml │ │ ├── maskformer2_aff_mini_bs32_45k.yaml │ │ ├── maskformer2_aff_small_bs32_45k.yaml │ │ └── maskformer2_aff_tiny_bs32_45k.yaml │ │ └── maskformer2_R50_bs16_90k.yaml └── coco │ └── instance-segmentation │ ├── Base-COCO-InstanceSegmentation.yaml │ ├── aff │ ├── maskformer2_aff_mini_1_5th_bs64_50ep.yaml │ ├── maskformer2_aff_mini_bs64_50ep.yaml │ ├── maskformer2_aff_small_1_5th_bs64_50ep.yaml │ ├── maskformer2_aff_small_bs64_50ep.yaml │ ├── maskformer2_aff_tiny_1_5th_bs64_50ep.yaml │ └── maskformer2_aff_tiny_bs64_50ep.yaml │ └── maskformer2_R50_bs16_50ep.yaml ├── create_env.sh ├── datasets ├── README.md ├── prepare_ade20k_sem_seg.py ├── prepare_coco_semantic_annos_from_panoptic_annos.py ├── prepare_cocofied_lvis.py └── prepare_cocofied_lvisv1.py ├── demo ├── demo.py └── predictor.py ├── demo1.png ├── demo2.png ├── mask2former ├── __init__.py ├── config.py ├── data │ ├── __init__.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── coco_instance_new_baseline_dataset_mapper.py │ │ ├── coco_panoptic_new_baseline_dataset_mapper.py │ │ ├── mask_former_instance_dataset_mapper.py │ │ ├── mask_former_panoptic_dataset_mapper.py │ │ └── mask_former_semantic_dataset_mapper.py │ └── datasets │ │ ├── __init__.py │ │ └── register_coco_panoptic_annos_semseg.py ├── evaluation │ ├── __init__.py │ └── instance_evaluation.py ├── maskformer_model.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── aff.py │ │ └── point_utils.py │ ├── clusten │ │ ├── __init__.py │ │ ├── clusten.py │ │ ├── src │ │ │ ├── clustenav_cuda.cpp │ │ │ ├── clustenav_cuda_kernel.cu │ │ │ ├── clustenqk_cuda.cpp │ │ │ ├── clustenqk_cuda_kernel.cu │ │ │ ├── clustenwf_cuda.cpp │ │ │ ├── clustenwf_cuda_kernel.cu │ │ │ ├── msdetrpc_cuda.cpp │ │ │ ├── msdetrpc_cuda_kernel.cu │ │ │ ├── setup.py │ │ │ ├── weighted_gather_cuda.cpp │ │ │ └── weighted_gather_cuda_kernel.cu │ │ ├── test_msdetrpc_kernel.py │ │ └── test_wg_kernel.py │ ├── criterion.py │ ├── matcher.py │ ├── meta_arch │ │ ├── __init__.py │ │ └── mask_former_head.py │ ├── pixel_decoder │ │ └── msdeformattn_pc.py │ └── transformer_decoder │ │ ├── __init__.py │ │ ├── mask2former_transformer_decoder.py │ │ ├── position_encoding.py │ │ └── transformer.py ├── test_time_augmentation.py └── utils │ ├── __init__.py │ └── misc.py ├── run_aff_segmentation.sh ├── run_demo.sh ├── tools ├── README.md ├── analyze_model.py └── convert-pretrained-model-to-d2.py └── train_net.py /.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | select = B,C,E,F,P,T4,W,B9 3 | max-line-length = 120 4 | # C408 ignored because we like the dict keyword argument syntax 5 | # E501 is not flexible enough, we're using B950 instead 6 | ignore = 7 | E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E303,E226, 8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 9 | # to line this up with executable bit 10 | EXE001, 11 | # these ignores are from flake8-bugbear; please fix! 12 | B007,B008, 13 | # these ignores are from flake8-comprehensions; please fix! 14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415, 15 | # for "unable to detect undefined names" 16 | F403, 17 | # for "Too many leading '#' for block comment (E266)" 18 | E266, 19 | # for "E731 do not assign a lambda expression, use a def" 20 | E731, 21 | # for "future feature annotations is not defined" 22 | F407, 23 | # do not use bare 'except' 24 | E722, 25 | per-file-ignores = 26 | __init__.py: F401, 27 | #pre_table is used as a global variable 28 | mask2former/modeling/pixel_decoder/msdeformattn_pc.py: F401 29 | optional-ascii-coding = True 30 | exclude = 31 | ./.git, 32 | ./docs, 33 | ./scripts, 34 | ./test 35 | ./third_party, 36 | ./venv, 37 | *.pyi 38 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pth filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.svg 2 | .nfs* 3 | .DS_Store 4 | __pycache__/ 5 | *swp* 6 | output/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 
50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 125 | [https://www.contributor-covenant.org/translations][translations]. 
126 | 127 | [homepage]: https://www.contributor-covenant.org 128 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 129 | [Mozilla CoC]: https://github.com/mozilla/diversity 130 | [FAQ]: https://www.contributor-covenant.org/faq 131 | [translations]: https://www.contributor-covenant.org/translations 132 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 10 | 11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoFocusFormer 2 | 3 | [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md) 4 | [![CLUSTEN](https://img.shields.io/badge/CUDA%20Extension-CLUSTEN-red)](clusten/) 5 | 6 | AFF-Base: [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/autofocusformer-image-segmentation-off-the/instance-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/instance-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/autofocusformer-image-segmentation-off-the/panoptic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/panoptic-segmentation-on-cityscapes-val?p=autofocusformer-image-segmentation-off-the) 7 | 8 | This software project accompanies the research paper, *AutoFocusFormer: Image Segmentation off the Grid* (CVPR 2023). 9 | 10 | [Chen Ziwen](https://www.chenziwe.com), Kaushik Patnaik, [Shuangfei Zhai](https://scholar.google.com/citations?user=G6vdBYsAAAAJ&hl=en), [Alvin Wan](http://alvinwan.com), [Zhile Ren](https://jrenzhile.com), [Alex Schwing](https://alexander-schwing.de/), [Alex Colburn](https://www.colburn.org), [Li Fuxin](https://web.engr.oregonstate.edu/~lif/) 11 | 12 | [arXiv](https://arxiv.org/abs/2304.12406) | [video narration](https://youtu.be/i1mZtk70yGY) | [AFF-Classification](https://github.com/apple/ml-autofocusformer) | [AFF-Segmentation (this repo)](https://github.com/apple/ml-autofocusformer-segmentation) 13 | 14 | ## Introduction 15 | 16 | AutoFocusFormer (AFF) is the first **adaptive**-downsampling network capable of **dense** prediction tasks such as semantic/instance segmentation. 17 | 18 | AFF abandons the traditional grid structure of image feature maps, and automatically learns to retain the most important pixels with respect to the task goal. 19 | 20 |
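To make the idea concrete, below is a minimal, hypothetical sketch of content-aware downsampling on a set of tokens with explicit 2D positions, assuming a simple keep-the-top-scoring-tokens rule; the actual AFF downsampler (see `mask2former/modeling/backbone/aff.py`) is considerably more involved, and all names in this snippet are illustrative only.

```python
import torch

def downsample_by_importance(pos, feat, score, rate=0.25):
    """Toy content-aware downsampling: keep the top `rate` fraction of tokens.

    pos:   (B, N, 2) token positions -- no grid structure is assumed
    feat:  (B, N, C) token features
    score: (B, N)    per-token importance (in AFF this is learned end-to-end)
    """
    k = max(1, int(pos.shape[1] * rate))
    idx = score.topk(k, dim=1).indices.unsqueeze(-1)           # (B, k, 1) indices of kept tokens
    kept_pos = torch.gather(pos, 1, idx.expand(-1, -1, pos.shape[-1]))
    kept_feat = torch.gather(feat, 1, idx.expand(-1, -1, feat.shape[-1]))
    return kept_pos, kept_feat

# toy example: 1 image, 16 tokens, 8-dim features, keep 1/4 of the tokens
pos, feat, score = torch.rand(1, 16, 2), torch.randn(1, 16, 8), torch.rand(1, 16)
print([t.shape for t in downsample_by_importance(pos, feat, score)])
# [torch.Size([1, 4, 2]), torch.Size([1, 4, 8])]
```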
23 | 24 | AFF consists of a local-attention transformer backbone and a task-specific head. The backbone is organized into four stages, each containing three modules: balanced clustering, local-attention transformer blocks, and adaptive downsampling. 25 | 26 |
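As a concrete example of this three-module stage structure, the per-stage hyper-parameters of the AFF-Mini backbone can be read directly from the configs later in this repo (`EMBED_DIM`, `DEPTHS`, `NUM_HEADS`, `NBHD_SIZE`, `CLUSTER_SIZE`, `DS_RATE`); the snippet below only prints that schedule and is not the model code itself:

```python
# Values taken from configs/*/aff/maskformer2_aff_mini_*.yaml (MODEL.AFF).
embed_dim = [32, 128, 256, 384]   # feature dimension per stage
depths    = [2, 2, 6, 2]          # local-attention blocks per stage
num_heads = [2, 4, 8, 16]         # attention heads per stage
nbhd_size = [48, 48, 48, 48]      # attended neighborhood size per stage
cluster_size, ds_rate = 8, 0.25   # tokens per cluster; downsampling rate (the 1/5 variants use 0.2)

for i, (dim, depth, heads, nbhd) in enumerate(zip(embed_dim, depths, num_heads, nbhd_size), 1):
    print(f"stage {i}: balanced clustering (cluster size {cluster_size}) -> "
          f"{depth} local-attention blocks (dim {dim}, heads {heads}, nbhd {nbhd}) -> "
          f"adaptive downsampling (rate {ds_rate})")
```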
29 | 30 | AFF delivers significant FLOPs savings (see our models with a 1/5 downsampling rate, i.e. `DS_RATE: 0.2` in the configs) and a marked improvement in the recognition of small objects. 31 | 32 | Notably, AFF-Small achieves **44.0** instance segmentation AP and **66.9** panoptic segmentation PQ on Cityscapes val with a backbone of only **42.6M** parameters, on par with Swin-Large, a backbone with **197M** params (a **78%** saving!). 33 | 34 |
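The quoted saving is simply the ratio of the two backbone sizes above: (197M − 42.6M) / 197M ≈ 78%.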
41 | 42 | This repository contains the AFF backbone and the point cloud-version of the Mask2Former segmentation head. 43 | 44 | We also add a few convenient functionalities, such as visualizing prediction results on blurred version of the images, and evaluating on cocofied lvis v1 annotations. 45 | 46 | ## Main Results with Pretrained Models 47 | 48 | **ADE20K Semantic Segmentation (val)** 49 | | backbone | method | pretrain | crop size | mIoU | FLOPs | checkpoint | 50 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 51 | | AFF-Mini | Mask2Former | ImageNet-1K | 512x512 | 46.5 | 48.3G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini.pth) | 52 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 512x512 | 46.0 | 39.9G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_mini_1_5th.pth) | 53 | | AFF-Tiny | Mask2Former | ImageNet-1K | 512x512 | 50.2 | 64.6G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny.pth) | 54 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 512x512 | 50.0 | 51.1G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_tiny_1_5th.pth) | 55 | | AFF-Small | Mask2Former | ImageNet-1K | 512x512 | 51.2 | 87G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small.pth) | 56 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 512x512 | 51.9 | 67.2G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/ade_sem/aff_small_1_5th.pth) | 57 | 58 | **Cityscapes Instance Segmentation (val)** 59 | | backbone | method | pretrain | AP | checkpoint | 60 | | :---: | :---: | :---: | :---: | :---: | 61 | | AFF-Mini | Mask2Former | ImageNet-1K | 40.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_mini.pth) | 62 | | AFF-Tiny | Mask2Former | ImageNet-1K | 42.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_tiny.pth) | 63 | | AFF-Small | Mask2Former | ImageNet-1K | 44.0 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_ins/aff_small.pth) | 64 | | AFF-Base | Mask2Former | ImageNet-22K | 46.2 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) | 65 | 66 | **Cityscapes Panoptic Segmentation (val)** 67 | | backbone | method | pretrain | PQ(s.s.) 
| checkpoint | 68 | | :---: | :---: | :---: | :---: | :---: | 69 | | AFF-Mini | Mask2Former | ImageNet-1K | 62.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_mini.pth) | 70 | | AFF-Tiny | Mask2Former | ImageNet-1K | 65.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_tiny.pth) | 71 | | AFF-Small | Mask2Former | ImageNet-1K | 66.9 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_small.pth) | 72 | | AFF-Base | Mask2Former | ImageNet-22K | 67.7 | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/city_pan/aff_base_22kto1k_384.pth) | 73 | 74 | **COCO Instance Segmentation (val)** 75 | | backbone | method | pretrain | epochs | AP | FLOPs | checkpoint | 76 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | 77 | | AFF-Mini | Mask2Former | ImageNet-1K | 50 | 42.3 | 148G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini.pth) | 78 | | AFF-Mini-1/5 | Mask2Former | ImageNet-1K | 50 | 42.3 | 120G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_mini_1_5th.pth) | 79 | | AFF-Tiny | Mask2Former | ImageNet-1K | 50 | 45.3 | 204G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny.pth) | 80 | | AFF-Tiny-1/5 | Mask2Former | ImageNet-1K | 50 | 44.5 | 152G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_tiny_1_5th.pth) | 81 | | AFF-Small | Mask2Former | ImageNet-1K | 50 | 46.4 | 281G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small.pth) | 82 | | AFF-Small-1/5 | Mask2Former | ImageNet-1K | 50 | 45.7 | 206G | [Apple ML](https://docs-assets.developer.apple.com/ml-research/models/aff/segmentation/coco_ins/aff_small_1_5th.pth) | 83 | 84 | ## Getting Started 85 | 86 | ### Clone this repo 87 | 88 | ```bash 89 | git clone git@github.com:apple/ml-autofocusformer-segmentation.git 90 | cd ml-autofocusformer-segmentation 91 | ``` 92 | One can download the pre-trained checkpoints through the links in the tables above. 93 | 94 | ### Create environment and install requirements 95 | 96 | ```bash 97 | sh create_env.sh 98 | ``` 99 | 100 | See further documentation inside the script file. 101 | 102 | Our experiments are run with `CUDA==11.6` and `pytorch==1.12`. 103 | 104 | ### Prepare data 105 | 106 | Please refer to [dataset README](datasets/README.md). 107 | 108 | ### Prepare pre-trained backbone checkpoint 109 | 110 | Use `tools/convert-pretrained-model-to-d2.py` to convert any torch checkpoint `.pth` file trained on ImageNet into a Detectron2 model zoo format `.pkl` file. 111 | ``` 112 | python tools/convert-pretrained-model-to-d2.py aff_mini.pth aff_mini.pkl 113 | ``` 114 | Otherwise, d2 will assume the checkpoint is for the entire segmentation model and will not add `backbone.` to the parameter names, and thus the checkpoint will not be properly loaded. 115 | 116 | ### Train and evaluate 117 | 118 | Modify the arguments in script `run_aff_segmentation.sh` and run 119 | ```bash 120 | sh run_aff_segmentation.sh 121 | ``` 122 | for training or evaluation. 123 | 124 | One can also directly modify the config files in `configs/`. 125 | 126 | ### Visualize predictions for pre-trained models 127 | 128 | See script `run_demo.sh`. 
More details can be found in [Mask2Former GETTING_STARTED.md](https://github.com/facebookresearch/Mask2Former/blob/main/GETTING_STARTED.md). 129 | 130 | ### Analyze model FLOPs 131 | 132 | See [tools README](tools/README.md). 133 | 134 | ## Citing AutoFocusFormer 135 | 136 | ```BibTeX 137 | @inproceedings{autofocusformer, 138 | title = {AutoFocusFormer: Image Segmentation off the Grid}, 139 | booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 140 | author = {Ziwen, Chen and Patnaik, Kaushik and Zhai, Shuangfei and Wan, Alvin and Ren, Zhile and Schwing, Alex and Colburn, Alex and Fuxin, Li}, 141 | year = {2023}, 142 | } 143 | ``` 144 | -------------------------------------------------------------------------------- /aff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/aff.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/architecture.png -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("ade20k_sem_seg_train",) 18 | TEST: ("ade20k_sem_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 160000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 512 40 | MAX_SIZE_TRAIN: 2048 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 512) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_semantic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 56 | MAX_SIZE: 3584 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | 
EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_mini_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_mini_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.2 17 | WEIGHTS: "aff_small_1_5th.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 80000 24 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_small_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 80000 24 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_1_5th_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_tiny_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/aff/maskformer2_aff_tiny_bs32_80k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_160k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 80000 23 | -------------------------------------------------------------------------------- /configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-ADE20K-SemanticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 150 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: False 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_instance_seg_train",) 18 | TEST: ("cityscapes_fine_instance_seg_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 90000 23 | 
WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_instance" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [128, 256, 512, 1024] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [4,8,16,32] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 24 13 | NBHD_SIZE: [144,144,144,144] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_base_22kto1k_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | MASK_FORMER: 21 | NUM_OBJECT_QUERIES: 250 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | MAX_ITER: 90000 26 | TEST: 27 | DETECTIONS_PER_IMAGE: 250 28 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 45000 24 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 8 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | NORM: "SyncBN" # use syncbn for cityscapes dataset 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("cityscapes_fine_panoptic_train",) 18 | TEST: ("cityscapes_fine_panoptic_val",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 
22 | MAX_ITER: 90000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.05 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | AMP: 35 | ENABLED: False 36 | INPUT: 37 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"] 38 | MIN_SIZE_TRAIN_SAMPLING: "choice" 39 | MIN_SIZE_TEST: 1024 40 | MAX_SIZE_TRAIN: 4096 41 | MAX_SIZE_TEST: 2048 42 | CROP: 43 | ENABLED: True 44 | TYPE: "absolute" 45 | SIZE: (512, 1024) 46 | SINGLE_CATEGORY_MAX_AREA: 1.0 47 | COLOR_AUG_SSD: True 48 | SIZE_DIVISIBILITY: -1 49 | FORMAT: "RGB" 50 | DATASET_MAPPER_NAME: "mask_former_panoptic" 51 | TEST: 52 | EVAL_PERIOD: 5000 53 | AUG: 54 | ENABLED: False 55 | MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792] 56 | MAX_SIZE: 4096 57 | FLIP: True 58 | DATALOADER: 59 | FILTER_EMPTY_ANNOTATIONS: True 60 | NUM_WORKERS: 4 61 | VERSION: 2 62 | SEED: 0 63 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_base_384_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [128, 256, 512, 1024] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [4,8,16,32] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 24 13 | NBHD_SIZE: [144,144,144,144] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_base_22kto1k_384.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | MASK_FORMER: 21 | NUM_OBJECT_QUERIES: 250 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | MAX_ITER: 90000 26 | TEST: 27 | DETECTIONS_PER_IMAGE: 250 28 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_mini_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 8.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 32 22 | BASE_LR: 0.0002 23 | MAX_ITER: 45000 24 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_tiny_bs32_45k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_90k.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 8.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 32 21 | BASE_LR: 0.0002 22 | MAX_ITER: 45000 23 | -------------------------------------------------------------------------------- /configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-Cityscapes-PanopticSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 19 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: True 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: True 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | AFF: 17 | SHEPARD_POWER: 4.0 18 | SHEPARD_POWER_LEARNABLE: False 19 | DATASETS: 20 | TRAIN: ("coco_2017_train",) 21 | TEST: ("coco_2017_val",) 22 | SOLVER: 23 | IMS_PER_BATCH: 16 24 | BASE_LR: 0.0001 25 | 
STEPS: (327778, 355092) 26 | MAX_ITER: 368750 27 | WARMUP_FACTOR: 1.0 28 | WARMUP_ITERS: 10 29 | WEIGHT_DECAY: 0.05 30 | OPTIMIZER: "ADAMW" 31 | BACKBONE_MULTIPLIER: 0.1 32 | CLIP_GRADIENTS: 33 | ENABLED: True 34 | CLIP_TYPE: "full_model" 35 | CLIP_VALUE: 0.01 36 | NORM_TYPE: 2.0 37 | AMP: 38 | ENABLED: False 39 | INPUT: 40 | IMAGE_SIZE: 1024 41 | MIN_SCALE: 0.1 42 | MAX_SCALE: 2.0 43 | FORMAT: "RGB" 44 | DATASET_MAPPER_NAME: "coco_instance_lsj" 45 | TEST: 46 | EVAL_PERIOD: 5000 47 | DATALOADER: 48 | FILTER_EMPTY_ANNOTATIONS: True 49 | NUM_WORKERS: 4 50 | VERSION: 2 51 | SEED: 0 52 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_mini_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_mini_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_mini_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [32,128,256,384] 7 | DEPTHS: [2, 2, 6, 2] 8 | NUM_HEADS: [ 2, 4, 8, 16 ] 9 | DROP_PATH_RATE: 0.0 10 | PATCH_NORM: True 11 | MLP_RATIO: 2. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_mini.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_small_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.2 17 | WEIGHTS: "aff_small_1_5th.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 64 22 | BASE_LR: 0.0002 23 | STEPS: (81945, 88773) 24 | MAX_ITER: 92188 25 | WARMUP_ITERS: 3 26 | CHECKPOINT_PERIOD: 2500 27 | TEST: 28 | EVAL_PERIOD: 2500 29 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_small_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [96,192,384,768] 7 | DEPTHS: [3,4,18,2] 8 | NUM_HEADS: [3,6,12,24] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | LAYER_SCALE: 1e-5 # turned off if 0.0 15 | ALPHA: 4.0 16 | DS_RATE: 0.25 17 | WEIGHTS: "aff_small.pkl" 18 | PIXEL_MEAN: [123.675, 116.280, 103.530] 19 | PIXEL_STD: [58.395, 57.120, 57.375] 20 | SOLVER: 21 | IMS_PER_BATCH: 64 22 | BASE_LR: 0.0002 23 | STEPS: (81945, 88773) 24 | MAX_ITER: 92188 25 | WARMUP_ITERS: 3 26 | CHECKPOINT_PERIOD: 2500 27 | TEST: 28 | EVAL_PERIOD: 2500 29 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_1_5th_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.2 16 | WEIGHTS: "aff_tiny_1_5th.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/aff/maskformer2_aff_tiny_bs64_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer2_R50_bs16_50ep.yaml 2 | MODEL: 3 | BACKBONE: 4 | NAME: "AutoFocusFormer" 5 | AFF: 6 | EMBED_DIM: [64,128,256,512] 7 | DEPTHS: [3,4,18,5] 8 | NUM_HEADS: [2,4,8,16] 9 | DROP_PATH_RATE: 0.3 10 | PATCH_NORM: True 11 | MLP_RATIO: 3. 
12 | CLUSTER_SIZE: 8 13 | NBHD_SIZE: [48,48,48,48] 14 | ALPHA: 4.0 15 | DS_RATE: 0.25 16 | WEIGHTS: "aff_tiny.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SOLVER: 20 | IMS_PER_BATCH: 64 21 | BASE_LR: 0.0002 22 | STEPS: (81945, 88773) 23 | MAX_ITER: 92188 24 | WARMUP_ITERS: 3 25 | CHECKPOINT_PERIOD: 2500 26 | TEST: 27 | EVAL_PERIOD: 2500 28 | -------------------------------------------------------------------------------- /configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-COCO-InstanceSegmentation.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IGNORE_VALUE: 255 7 | NUM_CLASSES: 80 8 | LOSS_WEIGHT: 1.0 9 | CONVS_DIM: 256 10 | MASK_DIM: 256 11 | NORM: "GN" 12 | # pixel decoder 13 | PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" 14 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 15 | DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] 16 | COMMON_STRIDE: 4 17 | TRANSFORMER_ENC_LAYERS: 6 18 | MASK_FORMER: 19 | TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder" 20 | TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" 21 | DEEP_SUPERVISION: True 22 | NO_OBJECT_WEIGHT: 0.1 23 | CLASS_WEIGHT: 2.0 24 | MASK_WEIGHT: 5.0 25 | DICE_WEIGHT: 5.0 26 | HIDDEN_DIM: 256 27 | NUM_OBJECT_QUERIES: 100 28 | NHEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | ENC_LAYERS: 0 32 | PRE_NORM: False 33 | ENFORCE_INPUT_PROJ: False 34 | SIZE_DIVISIBILITY: 32 35 | DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query 36 | TRAIN_NUM_POINTS: 12544 37 | OVERSAMPLE_RATIO: 3.0 38 | IMPORTANCE_SAMPLE_RATIO: 0.75 39 | TEST: 40 | SEMANTIC_ON: False 41 | INSTANCE_ON: True 42 | PANOPTIC_ON: False 43 | OVERLAP_THRESHOLD: 0.8 44 | OBJECT_MASK_THRESHOLD: 0.8 45 | -------------------------------------------------------------------------------- /create_env.sh: -------------------------------------------------------------------------------- 1 | # Create a conda virtual environment and activate it 2 | conda create -n aff python=3.8 3 | conda activate aff 4 | 5 | # Install requirements 6 | pip install \ 7 | yacs==0.1.8 \ 8 | termcolor==2.2.0 \ 9 | timm==0.6.12 \ 10 | pykeops==2.1.1 \ 11 | ptflops==0.6.9 \ 12 | numpy==1.22.4 \ 13 | cython==0.29.33 \ 14 | scipy==1.9.1 \ 15 | shapely==2.0.1 \ 16 | h5py==3.8.0 \ 17 | submitit==1.4.5 \ 18 | scikit-image==0.20.0 19 | conda install -c conda-forge opencv 20 | conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.6 -c pytorch -c conda-forge 21 | 22 | # Detectron2 23 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 24 | 25 | # add ADE20K_SEM_SEG_CATEGORIES_COLORS for consistent color in ADE prediction visualization 26 | mv ./builtin.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets 27 | mv ./builtin_meta.py path/to/conda/lib/python3.8/site-packages/detectron2/data/datasets 28 | 29 | # Install the custom CUDA kernels for AFF 30 | cd mask2former/modeling/clusten/src && python setup.py install 31 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Prepare Datasets 2 | 3 | A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) 4 | for 
its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.). 5 | This document explains how to set up the builtin datasets so they can be used by the above APIs. 6 | [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive into how to use `DatasetCatalog` and `MetadataCatalog`, 7 | and how to add new datasets to them. 8 | 9 | The datasets are assumed to exist in a directory specified by the environment variable 10 | `DETECTRON2_DATASETS`. 11 | Under this directory, detectron2 will look for datasets in the structure described below, if needed. 12 | ``` 13 | $DETECTRON2_DATASETS/ 14 | ADEChallengeData2016/ 15 | coco/ 16 | cityscapes/ 17 | ``` 18 | 19 | You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. 20 | If left unset, the default is `./datasets` relative to your current working directory. 21 | 22 | 23 | ## Expected dataset structure for [COCO](https://cocodataset.org/#download): 24 | 25 | ``` 26 | coco/ 27 | annotations/ 28 | instances_{train,val}2017.json 29 | panoptic_{train,val}2017.json 30 | {train,val}2017/ 31 | # image files that are mentioned in the corresponding json 32 | panoptic_{train,val}2017/ # png annotations 33 | panoptic_semseg_{train,val}2017/ # generated by the script mentioned below 34 | ``` 35 | 36 | Install panopticapi by: 37 | ``` 38 | pip install git+https://github.com/cocodataset/panopticapi.git 39 | ``` 40 | Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation). 41 | 42 | 43 | ## Expected dataset structure for [cityscapes](https://www.cityscapes-dataset.com/downloads/): 44 | ``` 45 | cityscapes/ 46 | gtFine/ 47 | train/ 48 | aachen/ 49 | color.png, instanceIds.png, labelIds.png, polygons.json, 50 | labelTrainIds.png 51 | ... 52 | val/ 53 | test/ 54 | # below are the generated Cityscapes panoptic annotations 55 | cityscapes_panoptic_train.json 56 | cityscapes_panoptic_train/ 57 | cityscapes_panoptic_val.json 58 | cityscapes_panoptic_val/ 59 | cityscapes_panoptic_test.json 60 | cityscapes_panoptic_test/ 61 | leftImg8bit/ 62 | train/ 63 | val/ 64 | test/ 65 | ``` 66 | Download the Cityscapes scripts by: 67 | ``` 68 | git clone https://github.com/mcordts/cityscapesScripts.git 69 | ``` 70 | 71 | Note: to create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation tool with: 72 | ``` 73 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py 74 | ``` 75 | These files are not needed for instance segmentation. 76 | 77 | Note: to generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation tool with: 78 | ``` 79 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py 80 | ``` 81 | These files are not needed for semantic and instance segmentation.
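
A quick way to catch path problems before launching training is to verify the layout under `DETECTRON2_DATASETS`. The snippet below is a minimal, illustrative check (the listed paths are assumptions; adjust them to the datasets and tasks you actually use):
```
import os
from pathlib import Path

# detectron2 falls back to ./datasets when DETECTRON2_DATASETS is unset
root = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))

# A few paths assumed by the builtin COCO and Cityscapes registrations;
# extend the list for ADE20k, LVIS, or the panoptic files as needed.
expected = [
    "coco/annotations/instances_train2017.json",
    "coco/train2017",
    "coco/val2017",
    "cityscapes/gtFine",
    "cityscapes/leftImg8bit",
]

for rel in expected:
    path = root / rel
    status = "ok" if path.exists() else "MISSING"
    print(f"{status:8} {path}")
```
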
82 | 83 | 84 | ## Expected dataset structure for [ADE20k](http://sceneparsing.csail.mit.edu/): 85 | ``` 86 | ADEChallengeData2016/ 87 | images/ 88 | annotations/ 89 | objectInfo150.txt 90 | # download instance annotation 91 | annotations_instance/ 92 | # generated by prepare_ade20k_sem_seg.py 93 | annotations_detectron2/ 94 | # below are generated by prepare_ade20k_pan_seg.py 95 | ade20k_panoptic_{train,val}.json 96 | ade20k_panoptic_{train,val}/ 97 | # below are generated by prepare_ade20k_ins_seg.py 98 | ade20k_instance_{train,val}.json 99 | ``` 100 | 101 | The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 102 | 103 | ## Expected dataset structure for [LVIS instance segmentation](https://www.lvisdataset.org/dataset): 104 | ``` 105 | coco/ 106 | {train,val,test}2017/ 107 | lvis/ 108 | lvis_v0.5_{train,val}.json 109 | lvis_v0.5_image_info_test.json 110 | lvis_v1_{train,val}.json 111 | lvis_v1_image_info_test{,_challenge}.json 112 | ``` 113 | 114 | Install lvis-api by: 115 | ``` 116 | pip install git+https://github.com/lvis-dataset/lvis-api.git 117 | ``` 118 | 119 | To evaluate models trained on the COCO dataset using LVIS annotations, 120 | run `python datasets/prepare_cocofied_lvis.py` to prepare "cocofied" LVIS v0.5 annotations, 121 | or `python datasets/prepare_cocofied_lvisv1.py` to prepare "cocofied" LVIS v1 annotations. 122 | 123 | Then, add `("lvis_v0.5_val_cocofied",)` or `("lvis_v1_val_cocofied",)` to DATASETS:TEST in config files. 124 | 125 | Finally, for v1, add `lvis_v1_cocofied` entry 126 | ``` 127 | "lvis_v1_cocofied": { 128 | "lvis_v1_val_cocofied": ("coco/", "lvis/lvis_v1_val_cocofied.json"), 129 | }, 130 | ``` 131 | to detectron2/data/datasets/builtin.py. 132 | -------------------------------------------------------------------------------- /datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | import os 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import tqdm 10 | from PIL import Image 11 | 12 | 13 | def convert(input, output): 14 | img = np.asarray(Image.open(input)) 15 | assert img.dtype == np.uint8 16 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 17 | Image.fromarray(img).save(output) 18 | 19 | 20 | if __name__ == "__main__": 21 | dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016" 22 | for name in ["training", "validation"]: 23 | annotation_dir = dataset_dir / "annotations" / name 24 | output_dir = dataset_dir / "annotations_detectron2" / name 25 | output_dir.mkdir(parents=True, exist_ok=True) 26 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 27 | output_file = output_dir / file.name 28 | convert(file, output_file) 29 | -------------------------------------------------------------------------------- /datasets/prepare_coco_semantic_annos_from_panoptic_annos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
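# In short: this script reads annotations/panoptic_{train,val}2017.json together with the
# matching panoptic PNGs and writes one semantic PNG per image into panoptic_semseg_{train,val}2017/.
# Each pixel stores the contiguous index of its COCO category (things and stuff alike, following
# the order of COCO_CATEGORIES), and 255 marks unlabeled pixels. It expects DETECTRON2_DATASETS
# to point at the dataset root described in datasets/README.md.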
4 | 5 | import functools 6 | import json 7 | import multiprocessing as mp 8 | import numpy as np 9 | import os 10 | import time 11 | from panopticapi.utils import rgb2id 12 | from PIL import Image 13 | 14 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 15 | 16 | 17 | def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): 18 | panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) 19 | panoptic = rgb2id(panoptic) 20 | output = np.zeros_like(panoptic, dtype=np.uint8) + 255 21 | for seg in segments: 22 | cat_id = seg["category_id"] 23 | new_cat_id = id_map[cat_id] 24 | output[panoptic == seg["id"]] = new_cat_id 25 | Image.fromarray(output).save(output_semantic) 26 | 27 | 28 | def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): 29 | """ 30 | Create semantic segmentation annotations from panoptic segmentation 31 | annotations, to be used by PanopticFPN. 32 | It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. 33 | It maps all stuff categories to contiguous ids starting from 1. 34 | Args: 35 | panoptic_json (str): path to the panoptic json file, in COCO's format. 36 | panoptic_root (str): a directory with panoptic annotation files, in COCO's format. 37 | sem_seg_root (str): a directory to output semantic annotation files 38 | categories (list[dict]): category metadata. Each dict needs to have: 39 | "id": corresponds to the "category_id" in the json annotations 40 | "isthing": 0 or 1 41 | """ 42 | os.makedirs(sem_seg_root, exist_ok=True) 43 | 44 | id_map = {} # map from category id to id in the output semantic annotation 45 | assert len(categories) <= 254 46 | for i, k in enumerate(categories): 47 | id_map[k["id"]] = i 48 | # what is id = 0? 49 | # id_map[0] = 255 50 | print(id_map) 51 | 52 | with open(panoptic_json) as f: 53 | obj = json.load(f) 54 | 55 | pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) 56 | 57 | def iter_annotations(): 58 | for anno in obj["annotations"]: 59 | file_name = anno["file_name"] 60 | segments = anno["segments_info"] 61 | input = os.path.join(panoptic_root, file_name) 62 | output = os.path.join(sem_seg_root, file_name) 63 | yield input, output, segments 64 | 65 | print("Start writing to {} ...".format(sem_seg_root)) 66 | start = time.time() 67 | pool.starmap( 68 | functools.partial(_process_panoptic_to_semantic, id_map=id_map), 69 | iter_annotations(), 70 | chunksize=100, 71 | ) 72 | print("Finished. time: {:.2f}s".format(time.time() - start)) 73 | 74 | 75 | if __name__ == "__main__": 76 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco") 77 | for s in ["val2017", "train2017"]: 78 | separate_coco_semantic_from_panoptic( 79 | os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), 80 | os.path.join(dataset_dir, "panoptic_{}".format(s)), 81 | os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)), 82 | COCO_CATEGORIES, 83 | ) 84 | -------------------------------------------------------------------------------- /datasets/prepare_cocofied_lvis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | import copy 6 | import json 7 | import os 8 | from collections import defaultdict 9 | 10 | # This mapping is extracted from the official LVIS mapping: 11 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 12 | COCO_SYNSET_CATEGORIES = [ 13 | {"synset": "person.n.01", "coco_cat_id": 1}, 14 | {"synset": "bicycle.n.01", "coco_cat_id": 2}, 15 | {"synset": "car.n.01", "coco_cat_id": 3}, 16 | {"synset": "motorcycle.n.01", "coco_cat_id": 4}, 17 | {"synset": "airplane.n.01", "coco_cat_id": 5}, 18 | {"synset": "bus.n.01", "coco_cat_id": 6}, 19 | {"synset": "train.n.01", "coco_cat_id": 7}, 20 | {"synset": "truck.n.01", "coco_cat_id": 8}, 21 | {"synset": "boat.n.01", "coco_cat_id": 9}, 22 | {"synset": "traffic_light.n.01", "coco_cat_id": 10}, 23 | {"synset": "fireplug.n.01", "coco_cat_id": 11}, 24 | {"synset": "stop_sign.n.01", "coco_cat_id": 13}, 25 | {"synset": "parking_meter.n.01", "coco_cat_id": 14}, 26 | {"synset": "bench.n.01", "coco_cat_id": 15}, 27 | {"synset": "bird.n.01", "coco_cat_id": 16}, 28 | {"synset": "cat.n.01", "coco_cat_id": 17}, 29 | {"synset": "dog.n.01", "coco_cat_id": 18}, 30 | {"synset": "horse.n.01", "coco_cat_id": 19}, 31 | {"synset": "sheep.n.01", "coco_cat_id": 20}, 32 | {"synset": "beef.n.01", "coco_cat_id": 21}, 33 | {"synset": "elephant.n.01", "coco_cat_id": 22}, 34 | {"synset": "bear.n.01", "coco_cat_id": 23}, 35 | {"synset": "zebra.n.01", "coco_cat_id": 24}, 36 | {"synset": "giraffe.n.01", "coco_cat_id": 25}, 37 | {"synset": "backpack.n.01", "coco_cat_id": 27}, 38 | {"synset": "umbrella.n.01", "coco_cat_id": 28}, 39 | {"synset": "bag.n.04", "coco_cat_id": 31}, 40 | {"synset": "necktie.n.01", "coco_cat_id": 32}, 41 | {"synset": "bag.n.06", "coco_cat_id": 33}, 42 | {"synset": "frisbee.n.01", "coco_cat_id": 34}, 43 | {"synset": "ski.n.01", "coco_cat_id": 35}, 44 | {"synset": "snowboard.n.01", "coco_cat_id": 36}, 45 | {"synset": "ball.n.06", "coco_cat_id": 37}, 46 | {"synset": "kite.n.03", "coco_cat_id": 38}, 47 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39}, 48 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40}, 49 | {"synset": "skateboard.n.01", "coco_cat_id": 41}, 50 | {"synset": "surfboard.n.01", "coco_cat_id": 42}, 51 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43}, 52 | {"synset": "bottle.n.01", "coco_cat_id": 44}, 53 | {"synset": "wineglass.n.01", "coco_cat_id": 46}, 54 | {"synset": "cup.n.01", "coco_cat_id": 47}, 55 | {"synset": "fork.n.01", "coco_cat_id": 48}, 56 | {"synset": "knife.n.01", "coco_cat_id": 49}, 57 | {"synset": "spoon.n.01", "coco_cat_id": 50}, 58 | {"synset": "bowl.n.03", "coco_cat_id": 51}, 59 | {"synset": "banana.n.02", "coco_cat_id": 52}, 60 | {"synset": "apple.n.01", "coco_cat_id": 53}, 61 | {"synset": "sandwich.n.01", "coco_cat_id": 54}, 62 | {"synset": "orange.n.01", "coco_cat_id": 55}, 63 | {"synset": "broccoli.n.01", "coco_cat_id": 56}, 64 | {"synset": "carrot.n.01", "coco_cat_id": 57}, 65 | {"synset": "frank.n.02", "coco_cat_id": 58}, 66 | {"synset": "pizza.n.01", "coco_cat_id": 59}, 67 | {"synset": "doughnut.n.02", "coco_cat_id": 60}, 68 | {"synset": "cake.n.03", "coco_cat_id": 61}, 69 | {"synset": "chair.n.01", "coco_cat_id": 62}, 70 | {"synset": "sofa.n.01", "coco_cat_id": 63}, 71 | {"synset": "pot.n.04", "coco_cat_id": 64}, 72 | {"synset": "bed.n.01", "coco_cat_id": 65}, 73 | {"synset": "dining_table.n.01", "coco_cat_id": 67}, 74 | {"synset": "toilet.n.02", "coco_cat_id": 70}, 75 | {"synset": "television_receiver.n.01", "coco_cat_id": 72}, 76 | {"synset": "laptop.n.01", 
"coco_cat_id": 73}, 77 | {"synset": "mouse.n.04", "coco_cat_id": 74}, 78 | {"synset": "remote_control.n.01", "coco_cat_id": 75}, 79 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76}, 80 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77}, 81 | {"synset": "microwave.n.02", "coco_cat_id": 78}, 82 | {"synset": "oven.n.01", "coco_cat_id": 79}, 83 | {"synset": "toaster.n.02", "coco_cat_id": 80}, 84 | {"synset": "sink.n.01", "coco_cat_id": 81}, 85 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82}, 86 | {"synset": "book.n.01", "coco_cat_id": 84}, 87 | {"synset": "clock.n.01", "coco_cat_id": 85}, 88 | {"synset": "vase.n.01", "coco_cat_id": 86}, 89 | {"synset": "scissors.n.01", "coco_cat_id": 87}, 90 | {"synset": "teddy.n.01", "coco_cat_id": 88}, 91 | {"synset": "hand_blower.n.01", "coco_cat_id": 89}, 92 | {"synset": "toothbrush.n.01", "coco_cat_id": 90}, 93 | ] 94 | 95 | 96 | def cocofy_lvis(input_filename, output_filename): 97 | """ 98 | Filter LVIS instance segmentation annotations to remove all categories that are not included in 99 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in 100 | the output json are the incontiguous COCO dataset ids. 101 | 102 | Args: 103 | input_filename (str): path to the LVIS json file. 104 | output_filename (str): path to the COCOfied json file. 105 | """ 106 | 107 | with open(input_filename, "r") as f: 108 | lvis_json = json.load(f) 109 | 110 | lvis_annos = lvis_json.pop("annotations") 111 | cocofied_lvis = copy.deepcopy(lvis_json) 112 | lvis_json["annotations"] = lvis_annos 113 | 114 | # Mapping from lvis cat id to coco cat id via synset 115 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]} 116 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES} 117 | # Synsets that we will keep in the dataset 118 | synsets_to_keep = set(synset_to_coco_cat_id.keys()) 119 | coco_cat_id_with_instances = defaultdict(int) 120 | 121 | new_annos = [] 122 | ann_id = 1 123 | for ann in lvis_annos: 124 | lvis_cat_id = ann["category_id"] 125 | synset = lvis_cat_id_to_synset[lvis_cat_id] 126 | if synset not in synsets_to_keep: 127 | continue 128 | coco_cat_id = synset_to_coco_cat_id[synset] 129 | new_ann = copy.deepcopy(ann) 130 | new_ann["category_id"] = coco_cat_id 131 | new_ann["id"] = ann_id 132 | ann_id += 1 133 | new_annos.append(new_ann) 134 | coco_cat_id_with_instances[coco_cat_id] += 1 135 | cocofied_lvis["annotations"] = new_annos 136 | 137 | for image in cocofied_lvis["images"]: 138 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 139 | new_category_list = [] 140 | for lvis_cat_id in image[key]: 141 | synset = lvis_cat_id_to_synset[lvis_cat_id] 142 | if synset not in synsets_to_keep: 143 | continue 144 | coco_cat_id = synset_to_coco_cat_id[synset] 145 | new_category_list.append(coco_cat_id) 146 | coco_cat_id_with_instances[coco_cat_id] += 1 147 | image[key] = new_category_list 148 | 149 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys()) 150 | 151 | new_categories = [] 152 | for cat in lvis_json["categories"]: 153 | synset = cat["synset"] 154 | if synset not in synsets_to_keep: 155 | continue 156 | coco_cat_id = synset_to_coco_cat_id[synset] 157 | if coco_cat_id not in coco_cat_id_with_instances: 158 | continue 159 | new_cat = copy.deepcopy(cat) 160 | new_cat["id"] = coco_cat_id 161 | new_categories.append(new_cat) 162 | cocofied_lvis["categories"] = new_categories 163 | 164 | with 
open(output_filename, "w") as f: 165 | json.dump(cocofied_lvis, f) 166 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename)) 167 | 168 | 169 | if __name__ == "__main__": 170 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis") 171 | for s in ["lvis_v0.5_train", "lvis_v0.5_val"]: 172 | print("Start COCOfing {}.".format(s)) 173 | cocofy_lvis( 174 | os.path.join(dataset_dir, "{}.json".format(s)), 175 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)), 176 | ) 177 | -------------------------------------------------------------------------------- /datasets/prepare_cocofied_lvisv1.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import copy 7 | import json 8 | import os 9 | from collections import defaultdict 10 | 11 | # This mapping is extracted from the official LVIS mapping: 12 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 13 | COCO_SYNSET_CATEGORIES = [ 14 | {"synset": "person.n.01", "coco_cat_id": 1}, 15 | {"synset": "bicycle.n.01", "coco_cat_id": 2}, 16 | {"synset": "car.n.01", "coco_cat_id": 3}, 17 | {"synset": "motorcycle.n.01", "coco_cat_id": 4}, 18 | {"synset": "airplane.n.01", "coco_cat_id": 5}, 19 | {"synset": "bus.n.01", "coco_cat_id": 6}, 20 | {"synset": "train.n.01", "coco_cat_id": 7}, 21 | {"synset": "truck.n.01", "coco_cat_id": 8}, 22 | {"synset": "boat.n.01", "coco_cat_id": 9}, 23 | {"synset": "traffic_light.n.01", "coco_cat_id": 10}, 24 | {"synset": "fireplug.n.01", "coco_cat_id": 11}, 25 | {"synset": "stop_sign.n.01", "coco_cat_id": 13}, 26 | {"synset": "parking_meter.n.01", "coco_cat_id": 14}, 27 | {"synset": "bench.n.01", "coco_cat_id": 15}, 28 | {"synset": "bird.n.01", "coco_cat_id": 16}, 29 | {"synset": "cat.n.01", "coco_cat_id": 17}, 30 | {"synset": "dog.n.01", "coco_cat_id": 18}, 31 | {"synset": "horse.n.01", "coco_cat_id": 19}, 32 | {"synset": "sheep.n.01", "coco_cat_id": 20}, 33 | {"synset": "beef.n.01", "coco_cat_id": 21}, 34 | {"synset": "elephant.n.01", "coco_cat_id": 22}, 35 | {"synset": "bear.n.01", "coco_cat_id": 23}, 36 | {"synset": "zebra.n.01", "coco_cat_id": 24}, 37 | {"synset": "giraffe.n.01", "coco_cat_id": 25}, 38 | {"synset": "backpack.n.01", "coco_cat_id": 27}, 39 | {"synset": "umbrella.n.01", "coco_cat_id": 28}, 40 | {"synset": "bag.n.04", "coco_cat_id": 31}, 41 | {"synset": "necktie.n.01", "coco_cat_id": 32}, 42 | {"synset": "bag.n.06", "coco_cat_id": 33}, 43 | {"synset": "frisbee.n.01", "coco_cat_id": 34}, 44 | {"synset": "ski.n.01", "coco_cat_id": 35}, 45 | {"synset": "snowboard.n.01", "coco_cat_id": 36}, 46 | {"synset": "ball.n.06", "coco_cat_id": 37}, 47 | {"synset": "kite.n.03", "coco_cat_id": 38}, 48 | {"synset": "baseball_bat.n.01", "coco_cat_id": 39}, 49 | {"synset": "baseball_glove.n.01", "coco_cat_id": 40}, 50 | {"synset": "skateboard.n.01", "coco_cat_id": 41}, 51 | {"synset": "surfboard.n.01", "coco_cat_id": 42}, 52 | {"synset": "tennis_racket.n.01", "coco_cat_id": 43}, 53 | {"synset": "bottle.n.01", "coco_cat_id": 44}, 54 | {"synset": "wineglass.n.01", "coco_cat_id": 46}, 55 | {"synset": "cup.n.01", "coco_cat_id": 47}, 56 | {"synset": "fork.n.01", "coco_cat_id": 48}, 57 | {"synset": "knife.n.01", "coco_cat_id": 49}, 58 | {"synset": "spoon.n.01", "coco_cat_id": 50}, 59 | {"synset": "bowl.n.03", "coco_cat_id": 51}, 60 | {"synset": "banana.n.02", "coco_cat_id": 52}, 61 | 
{"synset": "apple.n.01", "coco_cat_id": 53}, 62 | {"synset": "sandwich.n.01", "coco_cat_id": 54}, 63 | {"synset": "orange.n.01", "coco_cat_id": 55}, 64 | {"synset": "broccoli.n.01", "coco_cat_id": 56}, 65 | {"synset": "carrot.n.01", "coco_cat_id": 57}, 66 | {"synset": "frank.n.02", "coco_cat_id": 58}, 67 | {"synset": "pizza.n.01", "coco_cat_id": 59}, 68 | {"synset": "doughnut.n.02", "coco_cat_id": 60}, 69 | {"synset": "cake.n.03", "coco_cat_id": 61}, 70 | {"synset": "chair.n.01", "coco_cat_id": 62}, 71 | {"synset": "sofa.n.01", "coco_cat_id": 63}, 72 | {"synset": "pot.n.04", "coco_cat_id": 64}, 73 | {"synset": "bed.n.01", "coco_cat_id": 65}, 74 | {"synset": "dining_table.n.01", "coco_cat_id": 67}, 75 | {"synset": "toilet.n.02", "coco_cat_id": 70}, 76 | {"synset": "television_receiver.n.01", "coco_cat_id": 72}, 77 | {"synset": "laptop.n.01", "coco_cat_id": 73}, 78 | {"synset": "mouse.n.04", "coco_cat_id": 74}, 79 | {"synset": "remote_control.n.01", "coco_cat_id": 75}, 80 | {"synset": "computer_keyboard.n.01", "coco_cat_id": 76}, 81 | {"synset": "cellular_telephone.n.01", "coco_cat_id": 77}, 82 | {"synset": "microwave.n.02", "coco_cat_id": 78}, 83 | {"synset": "oven.n.01", "coco_cat_id": 79}, 84 | {"synset": "toaster.n.02", "coco_cat_id": 80}, 85 | {"synset": "sink.n.01", "coco_cat_id": 81}, 86 | {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82}, 87 | {"synset": "book.n.01", "coco_cat_id": 84}, 88 | {"synset": "clock.n.01", "coco_cat_id": 85}, 89 | {"synset": "vase.n.01", "coco_cat_id": 86}, 90 | {"synset": "scissors.n.01", "coco_cat_id": 87}, 91 | {"synset": "teddy.n.01", "coco_cat_id": 88}, 92 | {"synset": "hand_blower.n.01", "coco_cat_id": 89}, 93 | {"synset": "toothbrush.n.01", "coco_cat_id": 90}, 94 | ] 95 | 96 | 97 | def cocofy_lvis(input_filename, output_filename): 98 | """ 99 | Filter LVIS instance segmentation annotations to remove all categories that are not included in 100 | COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in 101 | the output json are the incontiguous COCO dataset ids. 102 | 103 | Args: 104 | input_filename (str): path to the LVIS json file. 105 | output_filename (str): path to the COCOfied json file. 
106 | """ 107 | 108 | with open(input_filename, "r") as f: 109 | lvis_json = json.load(f) 110 | 111 | lvis_annos = lvis_json.pop("annotations") 112 | lvis_imgs = lvis_json.pop("images") 113 | cocofied_lvis = copy.deepcopy(lvis_json) 114 | lvis_json["annotations"] = lvis_annos 115 | lvis_json["images"] = lvis_imgs 116 | 117 | # Mapping from lvis cat id to coco cat id via synset 118 | lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]} 119 | synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES} 120 | # Synsets that we will keep in the dataset 121 | synsets_to_keep = set(synset_to_coco_cat_id.keys()) 122 | coco_cat_id_with_instances = defaultdict(int) 123 | 124 | invalid_img_ids = set() 125 | new_img_id_dict = {} 126 | 127 | new_images = [] 128 | img_id = 1 129 | for image in lvis_imgs: 130 | coco_url = image['coco_url'] 131 | split, file_name = coco_url.split('/')[-2:] 132 | if split == 'train2017': 133 | invalid_img_ids.add(image['id']) 134 | continue 135 | new_img = copy.deepcopy(image) 136 | new_img_id_dict[new_img['id']] = img_id 137 | new_img['id'] = img_id 138 | img_id += 1 139 | new_img['file_name'] = file_name 140 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 141 | new_category_list = [] 142 | for lvis_cat_id in new_img[key]: 143 | synset = lvis_cat_id_to_synset[lvis_cat_id] 144 | if synset not in synsets_to_keep: 145 | continue 146 | coco_cat_id = synset_to_coco_cat_id[synset] 147 | new_category_list.append(coco_cat_id) 148 | coco_cat_id_with_instances[coco_cat_id] += 1 149 | new_img[key] = new_category_list 150 | new_images.append(new_img) 151 | cocofied_lvis["images"] = new_images 152 | 153 | new_annos = [] 154 | ann_id = 1 155 | for ann in lvis_annos: 156 | img_id = ann["image_id"] 157 | if img_id in invalid_img_ids: 158 | continue 159 | lvis_cat_id = ann["category_id"] 160 | synset = lvis_cat_id_to_synset[lvis_cat_id] 161 | if synset not in synsets_to_keep: 162 | continue 163 | coco_cat_id = synset_to_coco_cat_id[synset] 164 | new_ann = copy.deepcopy(ann) 165 | new_ann["category_id"] = coco_cat_id 166 | new_ann["id"] = ann_id 167 | ann_id += 1 168 | new_ann["image_id"] = new_img_id_dict[img_id] 169 | new_annos.append(new_ann) 170 | coco_cat_id_with_instances[coco_cat_id] += 1 171 | cocofied_lvis["annotations"] = new_annos 172 | 173 | 174 | coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys()) 175 | 176 | new_categories = [] 177 | for cat in lvis_json["categories"]: 178 | synset = cat["synset"] 179 | if synset not in synsets_to_keep: 180 | continue 181 | coco_cat_id = synset_to_coco_cat_id[synset] 182 | if coco_cat_id not in coco_cat_id_with_instances: 183 | continue 184 | new_cat = copy.deepcopy(cat) 185 | new_cat["id"] = coco_cat_id 186 | new_categories.append(new_cat) 187 | cocofied_lvis["categories"] = new_categories 188 | 189 | with open(output_filename, "w") as f: 190 | json.dump(cocofied_lvis, f) 191 | print("{} is COCOfied and stored in {}.".format(input_filename, output_filename)) 192 | 193 | 194 | if __name__ == "__main__": 195 | dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "lvis") 196 | for s in ["lvis_v1_val"]: 197 | print("Start COCOfing {}.".format(s)) 198 | cocofy_lvis( 199 | os.path.join(dataset_dir, "{}.json".format(s)), 200 | os.path.join(dataset_dir, "{}_cocofied.json".format(s)), 201 | ) 202 | -------------------------------------------------------------------------------- /demo/demo.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import argparse 6 | import glob 7 | import multiprocessing as mp 8 | import os 9 | 10 | # fmt: off 11 | import sys 12 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 13 | # fmt: on 14 | 15 | import tempfile 16 | import time 17 | import warnings 18 | 19 | import cv2 20 | import numpy as np 21 | import tqdm 22 | 23 | from detectron2.config import get_cfg 24 | from detectron2.data.detection_utils import read_image 25 | from detectron2.projects.deeplab import add_deeplab_config 26 | from detectron2.utils.logger import setup_logger 27 | 28 | from mask2former import add_maskformer2_config 29 | from predictor import VisualizationDemo 30 | 31 | 32 | # constants 33 | WINDOW_NAME = "mask2former demo" 34 | 35 | 36 | def setup_cfg(args): 37 | # load config from file and command-line arguments 38 | cfg = get_cfg() 39 | add_deeplab_config(cfg) 40 | add_maskformer2_config(cfg) 41 | cfg.merge_from_file(args.config_file) 42 | cfg.merge_from_list(args.opts) 43 | cfg.freeze() 44 | return cfg 45 | 46 | 47 | def get_parser(): 48 | parser = argparse.ArgumentParser(description="maskformer2 demo for builtin configs") 49 | parser.add_argument( 50 | "--config-file", 51 | default="configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml", 52 | metavar="FILE", 53 | help="path to config file", 54 | ) 55 | parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") 56 | parser.add_argument("--video-input", help="Path to video file.") 57 | parser.add_argument( 58 | "--input", 59 | nargs="+", 60 | help="A list of space separated input images; " 61 | "or a single glob pattern such as 'directory/*.jpg'", 62 | ) 63 | parser.add_argument( 64 | "--output", 65 | help="A file or directory to save output visualizations. " 66 | "If not given, will show output in an OpenCV window.", 67 | ) 68 | 69 | parser.add_argument( 70 | "--confidence-threshold", 71 | type=float, 72 | default=0.5, 73 | help="Minimum score for instance predictions to be shown", 74 | ) 75 | parser.add_argument( 76 | "--opts", 77 | help="Modify config options using the command-line 'KEY VALUE' pairs", 78 | default=[], 79 | nargs=argparse.REMAINDER, 80 | ) 81 | parser.add_argument( 82 | "--blur", 83 | help="A directory containing blurred version of the inputs (e.g., blurred human faces). " 84 | "If given, predictions are visualized on the blurred images." 
85 | "Images inside this folder need to have the same name as the input images", 86 | ) 87 | return parser 88 | 89 | 90 | def test_opencv_video_format(codec, file_ext): 91 | with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: 92 | filename = os.path.join(dir, "test_file" + file_ext) 93 | writer = cv2.VideoWriter( 94 | filename=filename, 95 | fourcc=cv2.VideoWriter_fourcc(*codec), 96 | fps=float(30), 97 | frameSize=(10, 10), 98 | isColor=True, 99 | ) 100 | [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] 101 | writer.release() 102 | if os.path.isfile(filename): 103 | return True 104 | return False 105 | 106 | 107 | if __name__ == "__main__": 108 | mp.set_start_method("spawn", force=True) 109 | args = get_parser().parse_args() 110 | setup_logger(name="fvcore") 111 | logger = setup_logger() 112 | logger.info("Arguments: " + str(args)) 113 | 114 | cfg = setup_cfg(args) 115 | 116 | demo = VisualizationDemo(cfg) 117 | 118 | if args.input: 119 | if len(args.input) == 1: 120 | args.input = glob.glob(os.path.expanduser(args.input[0])) 121 | assert args.input, "The input path(s) was not found" 122 | for path in tqdm.tqdm(args.input, disable=not args.output): 123 | # use PIL, to be consistent with evaluation 124 | img = read_image(path, format="BGR") 125 | if args.blur: 126 | path_blur = os.path.join(args.blur, path.split('/')[-1]) 127 | img_blur = read_image(path_blur, format="BGR") 128 | else: 129 | img_blur = None 130 | start_time = time.time() 131 | predictions, visualized_output = demo.run_on_image(img, blur=img_blur) 132 | logger.info( 133 | "{}: {} in {:.2f}s".format( 134 | path, 135 | "detected {} instances".format(len(predictions["instances"])) 136 | if "instances" in predictions 137 | else "finished", 138 | time.time() - start_time, 139 | ) 140 | ) 141 | 142 | if args.output: 143 | if os.path.isdir(args.output): 144 | assert os.path.isdir(args.output), args.output 145 | out_filename = os.path.join(args.output, os.path.basename(path)) 146 | else: 147 | assert len(args.input) == 1, "Please specify a directory with args.output" 148 | out_filename = args.output 149 | visualized_output.save(out_filename) 150 | else: 151 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 152 | cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 153 | if cv2.waitKey(0) == 27: 154 | break # esc to quit 155 | elif args.webcam: 156 | assert args.input is None, "Cannot have both --input and --webcam!" 157 | assert args.output is None, "output not yet supported with --webcam!" 
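# Webcam mode: read frames from the default camera (device 0), show the visualized
# predictions frame by frame, and stop when the Esc key is pressed.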
158 | cam = cv2.VideoCapture(0) 159 | for vis in tqdm.tqdm(demo.run_on_video(cam)): 160 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 161 | cv2.imshow(WINDOW_NAME, vis) 162 | if cv2.waitKey(1) == 27: 163 | break # esc to quit 164 | cam.release() 165 | cv2.destroyAllWindows() 166 | elif args.video_input: 167 | video = cv2.VideoCapture(args.video_input) 168 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 169 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 170 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 171 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 172 | basename = os.path.basename(args.video_input) 173 | codec, file_ext = ( 174 | ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") 175 | ) 176 | if codec == ".mp4v": 177 | warnings.warn("x264 codec not available, switching to mp4v") 178 | if args.output: 179 | if os.path.isdir(args.output): 180 | output_fname = os.path.join(args.output, basename) 181 | output_fname = os.path.splitext(output_fname)[0] + file_ext 182 | else: 183 | output_fname = args.output 184 | assert not os.path.isfile(output_fname), output_fname 185 | output_file = cv2.VideoWriter( 186 | filename=output_fname, 187 | # some installation of opencv may not support x264 (due to its license), 188 | # you can try other format (e.g. MPEG) 189 | fourcc=cv2.VideoWriter_fourcc(*codec), 190 | fps=float(frames_per_second), 191 | frameSize=(width, height), 192 | isColor=True, 193 | ) 194 | assert os.path.isfile(args.video_input) 195 | for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): 196 | if args.output: 197 | output_file.write(vis_frame) 198 | else: 199 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 200 | cv2.imshow(basename, vis_frame) 201 | if cv2.waitKey(1) == 27: 202 | break # esc to quit 203 | video.release() 204 | if args.output: 205 | output_file.release() 206 | else: 207 | cv2.destroyAllWindows() 208 | -------------------------------------------------------------------------------- /demo/predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import atexit 6 | import bisect 7 | import multiprocessing as mp 8 | from collections import deque 9 | 10 | import cv2 11 | import torch 12 | 13 | from detectron2.data import MetadataCatalog 14 | from detectron2.engine.defaults import DefaultPredictor 15 | from detectron2.utils.video_visualizer import VideoVisualizer 16 | from detectron2.utils.visualizer import ColorMode, Visualizer 17 | 18 | 19 | class VisualizationDemo(object): 20 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 21 | """ 22 | Args: 23 | cfg (CfgNode): 24 | instance_mode (ColorMode): 25 | parallel (bool): whether to run the model in different processes from visualization. 26 | Useful since the visualization logic can be slow. 
27 | """ 28 | self.metadata = MetadataCatalog.get( 29 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 30 | ) 31 | self.cpu_device = torch.device("cpu") 32 | self.instance_mode = instance_mode 33 | 34 | self.parallel = parallel 35 | if parallel: 36 | num_gpu = torch.cuda.device_count() 37 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 38 | else: 39 | self.predictor = DefaultPredictor(cfg) 40 | 41 | def run_on_image(self, image, blur=None): 42 | """ 43 | Args: 44 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 45 | This is the format used by OpenCV. 46 | Returns: 47 | predictions (dict): the output of the model. 48 | vis_output (VisImage): the visualized image output. 49 | """ 50 | vis_output = None 51 | predictions = self.predictor(image) 52 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 53 | if blur is not None: 54 | blur = blur[:, :, ::-1] 55 | visualizer = Visualizer(blur, self.metadata, instance_mode=self.instance_mode) 56 | else: 57 | image = image[:, :, ::-1] 58 | visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) 59 | if "panoptic_seg" in predictions: 60 | panoptic_seg, segments_info = predictions["panoptic_seg"] 61 | vis_output = visualizer.draw_panoptic_seg_predictions( 62 | panoptic_seg.to(self.cpu_device), segments_info 63 | ) 64 | else: 65 | if "sem_seg" in predictions: 66 | vis_output = visualizer.draw_sem_seg( 67 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 68 | ) 69 | if "instances" in predictions: 70 | instances = predictions["instances"].to(self.cpu_device) 71 | vis_output = visualizer.draw_instance_predictions(predictions=instances) 72 | 73 | return predictions, vis_output 74 | 75 | def _frame_from_video(self, video): 76 | while video.isOpened(): 77 | success, frame = video.read() 78 | if success: 79 | yield frame 80 | else: 81 | break 82 | 83 | def run_on_video(self, video): 84 | """ 85 | Visualizes predictions on frames of the input video. 86 | Args: 87 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 88 | either a webcam or a video file. 89 | Yields: 90 | ndarray: BGR visualizations of each video frame. 
91 | """ 92 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 93 | 94 | def process_predictions(frame, predictions): 95 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 96 | if "panoptic_seg" in predictions: 97 | panoptic_seg, segments_info = predictions["panoptic_seg"] 98 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 99 | frame, panoptic_seg.to(self.cpu_device), segments_info 100 | ) 101 | elif "instances" in predictions: 102 | predictions = predictions["instances"].to(self.cpu_device) 103 | vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) 104 | elif "sem_seg" in predictions: 105 | vis_frame = video_visualizer.draw_sem_seg( 106 | frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 107 | ) 108 | 109 | # Converts Matplotlib RGB format to OpenCV BGR format 110 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 111 | return vis_frame 112 | 113 | frame_gen = self._frame_from_video(video) 114 | if self.parallel: 115 | buffer_size = self.predictor.default_buffer_size 116 | 117 | frame_data = deque() 118 | 119 | for cnt, frame in enumerate(frame_gen): 120 | frame_data.append(frame) 121 | self.predictor.put(frame) 122 | 123 | if cnt >= buffer_size: 124 | frame = frame_data.popleft() 125 | predictions = self.predictor.get() 126 | yield process_predictions(frame, predictions) 127 | 128 | while len(frame_data): 129 | frame = frame_data.popleft() 130 | predictions = self.predictor.get() 131 | yield process_predictions(frame, predictions) 132 | else: 133 | for frame in frame_gen: 134 | yield process_predictions(frame, self.predictor(frame)) 135 | 136 | 137 | class AsyncPredictor: 138 | """ 139 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 140 | Because rendering the visualization takes considerably amount of time, 141 | this helps improve throughput a little bit when rendering videos. 
142 | """ 143 | 144 | class _StopToken: 145 | pass 146 | 147 | class _PredictWorker(mp.Process): 148 | def __init__(self, cfg, task_queue, result_queue): 149 | self.cfg = cfg 150 | self.task_queue = task_queue 151 | self.result_queue = result_queue 152 | super().__init__() 153 | 154 | def run(self): 155 | predictor = DefaultPredictor(self.cfg) 156 | 157 | while True: 158 | task = self.task_queue.get() 159 | if isinstance(task, AsyncPredictor._StopToken): 160 | break 161 | idx, data = task 162 | result = predictor(data) 163 | self.result_queue.put((idx, result)) 164 | 165 | def __init__(self, cfg, num_gpus: int = 1): 166 | """ 167 | Args: 168 | cfg (CfgNode): 169 | num_gpus (int): if 0, will run on CPU 170 | """ 171 | num_workers = max(num_gpus, 1) 172 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 173 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 174 | self.procs = [] 175 | for gpuid in range(max(num_gpus, 1)): 176 | cfg = cfg.clone() 177 | cfg.defrost() 178 | cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" 179 | self.procs.append( 180 | AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) 181 | ) 182 | 183 | self.put_idx = 0 184 | self.get_idx = 0 185 | self.result_rank = [] 186 | self.result_data = [] 187 | 188 | for p in self.procs: 189 | p.start() 190 | atexit.register(self.shutdown) 191 | 192 | def put(self, image): 193 | self.put_idx += 1 194 | self.task_queue.put((self.put_idx, image)) 195 | 196 | def get(self): 197 | self.get_idx += 1 # the index needed for this request 198 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 199 | res = self.result_data[0] 200 | del self.result_data[0], self.result_rank[0] 201 | return res 202 | 203 | while True: 204 | # make sure the results are returned in the correct order 205 | idx, res = self.result_queue.get() 206 | if idx == self.get_idx: 207 | return res 208 | insert = bisect.bisect(self.result_rank, idx) 209 | self.result_rank.insert(insert, idx) 210 | self.result_data.insert(insert, res) 211 | 212 | def __len__(self): 213 | return self.put_idx - self.get_idx 214 | 215 | def __call__(self, image): 216 | self.put(image) 217 | return self.get() 218 | 219 | def shutdown(self): 220 | for _ in self.procs: 221 | self.task_queue.put(AsyncPredictor._StopToken()) 222 | 223 | @property 224 | def default_buffer_size(self): 225 | return len(self.procs) * 5 226 | -------------------------------------------------------------------------------- /demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo1.png -------------------------------------------------------------------------------- /demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-autofocusformer-segmentation/52908e8ad5112b5bff1d043e6a06a9e8f9aad3ba/demo2.png -------------------------------------------------------------------------------- /mask2former/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import data # register all new datasets 4 | from . 
import modeling 5 | 6 | # config 7 | from .config import add_maskformer2_config 8 | 9 | # dataset loading 10 | from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper 11 | from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper 12 | from .data.dataset_mappers.mask_former_instance_dataset_mapper import ( 13 | MaskFormerInstanceDatasetMapper, 14 | ) 15 | from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import ( 16 | MaskFormerPanopticDatasetMapper, 17 | ) 18 | from .data.dataset_mappers.mask_former_semantic_dataset_mapper import ( 19 | MaskFormerSemanticDatasetMapper, 20 | ) 21 | 22 | # models 23 | from .maskformer_model import MaskFormer 24 | from .test_time_augmentation import SemanticSegmentorWithTTA 25 | 26 | # evaluation 27 | from .evaluation.instance_evaluation import InstanceSegEvaluator 28 | -------------------------------------------------------------------------------- /mask2former/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Adapted for AutoFocusFormer by Ziwen 2023 3 | 4 | from detectron2.config import CfgNode as CN 5 | 6 | 7 | def add_maskformer2_config(cfg): 8 | """ 9 | Add config for MASK_FORMER. 10 | """ 11 | # NOTE: configs from original maskformer 12 | # data config 13 | # select the dataset mapper 14 | cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" 15 | # Color augmentation 16 | cfg.INPUT.COLOR_AUG_SSD = False 17 | # We retry random cropping until no single category in semantic segmentation GT occupies more 18 | # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 19 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 20 | # Pad image and segmentation GT in dataset mapper. 21 | cfg.INPUT.SIZE_DIVISIBILITY = -1 22 | 23 | # solver config 24 | # weight decay on embedding 25 | cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 26 | # optimizer 27 | cfg.SOLVER.OPTIMIZER = "ADAMW" 28 | cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 29 | 30 | # mask_former model config 31 | cfg.MODEL.MASK_FORMER = CN() 32 | 33 | # loss 34 | cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True 35 | cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 36 | cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0 37 | cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 38 | cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 39 | 40 | # transformer config 41 | cfg.MODEL.MASK_FORMER.NHEADS = 8 42 | cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 43 | cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 44 | cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 45 | cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 46 | cfg.MODEL.MASK_FORMER.PRE_NORM = False 47 | 48 | cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 49 | cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 50 | 51 | cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" 52 | cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False 53 | 54 | # mask_former inference config 55 | cfg.MODEL.MASK_FORMER.TEST = CN() 56 | cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True 57 | cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False 58 | cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False 59 | cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 60 | cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 61 | cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False 62 | 63 | # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) 64 | # you can use this config to override 65 | cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 66 | 67 | # pixel decoder config 68 | cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 69 | # adding transformer in pixel decoder 70 | cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 71 | # pixel decoder 72 | cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MSDeformAttnPixelDecoder" 73 | 74 | # autofocusformer backbone 75 | cfg.MODEL.AFF = CN() 76 | cfg.MODEL.AFF.EMBED_DIM = [32, 128, 256, 384] 77 | cfg.MODEL.AFF.DEPTHS = [2, 2, 6, 2] 78 | cfg.MODEL.AFF.NUM_HEADS = [3, 6, 12, 24] 79 | cfg.MODEL.AFF.MLP_RATIO = 2.0 80 | cfg.MODEL.AFF.CLUSTER_SIZE = 8 81 | cfg.MODEL.AFF.NBHD_SIZE = [48, 48, 48, 48] 82 | cfg.MODEL.AFF.LAYER_SCALE = 0.0 83 | cfg.MODEL.AFF.ALPHA = 4.0 84 | cfg.MODEL.AFF.DS_RATE = 0.25 85 | cfg.MODEL.AFF.RESERVE = True 86 | cfg.MODEL.AFF.DROP_RATE = 0.0 87 | cfg.MODEL.AFF.ATTN_DROP_RATE = 0.0 88 | cfg.MODEL.AFF.DROP_PATH_RATE = 0.3 89 | cfg.MODEL.AFF.PATCH_NORM = True 90 | cfg.MODEL.AFF.OUT_FEATURES = ["res2", "res3", "res4", "res5"] 91 | cfg.MODEL.AFF.SHEPARD_POWER = 6.0 92 | cfg.MODEL.AFF.SHEPARD_POWER_LEARNABLE = True 93 | 94 | # NOTE: maskformer2 extra configs 95 | # transformer module 96 | cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder" 97 | 98 | # LSJ aug 99 | cfg.INPUT.IMAGE_SIZE = 1024 100 | cfg.INPUT.MIN_SCALE = 0.1 101 | cfg.INPUT.MAX_SCALE = 2.0 102 | 103 | # MSDeformAttn encoder configs 104 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] 105 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 106 | cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 107 | 108 | # point loss configs 109 | # Number of points sampled during training for a mask point head. 110 | cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 111 | # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the 112 | # original paper. 113 | cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0 114 | # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in 115 | # the original paper. 116 | cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 117 | -------------------------------------------------------------------------------- /mask2former/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import datasets 4 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | 4 | import copy 5 | import logging 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | 14 | from pycocotools import mask as coco_mask 15 | 16 | __all__ = ["COCOInstanceNewBaselineDatasetMapper"] 17 | 18 | 19 | def convert_coco_poly_to_mask(segmentations, height, width): 20 | masks = [] 21 | for polygons in segmentations: 22 | rles = coco_mask.frPyObjects(polygons, height, width) 23 | mask = coco_mask.decode(rles) 24 | if len(mask.shape) < 3: 25 | mask = mask[..., None] 26 | mask = torch.as_tensor(mask, dtype=torch.uint8) 27 | mask = mask.any(dim=2) 28 | masks.append(mask) 29 | if masks: 30 | masks = torch.stack(masks, dim=0) 31 | else: 32 | masks = torch.zeros((0, height, width), dtype=torch.uint8) 33 | return masks 34 | 35 | 36 | def build_transform_gen(cfg, is_train): 37 | """ 38 | Create a list of default :class:`Augmentation` from config. 39 | Now it includes resizing and flipping. 40 | Returns: 41 | list[Augmentation] 42 | """ 43 | assert is_train, "Only support training augmentation" 44 | image_size = cfg.INPUT.IMAGE_SIZE 45 | min_scale = cfg.INPUT.MIN_SCALE 46 | max_scale = cfg.INPUT.MAX_SCALE 47 | 48 | augmentation = [] 49 | 50 | if cfg.INPUT.RANDOM_FLIP != "none": 51 | augmentation.append( 52 | T.RandomFlip( 53 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 54 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 55 | ) 56 | ) 57 | 58 | augmentation.extend([ 59 | T.ResizeScale( 60 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 61 | ), 62 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 63 | ]) 64 | 65 | return augmentation 66 | 67 | 68 | # This is specifically designed for the COCO dataset. 69 | class COCOInstanceNewBaselineDatasetMapper: 70 | """ 71 | A callable which takes a dataset dict in Detectron2 Dataset format, 72 | and map it into a format used by MaskFormer. 73 | 74 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 75 | 76 | The callable currently does the following: 77 | 78 | 1. Read the image from "file_name" 79 | 2. Applies geometric transforms to the image and annotation 80 | 3. Find and applies suitable cropping to the image and annotation 81 | 4. Prepare image and annotation to Tensors 82 | """ 83 | 84 | @configurable 85 | def __init__( 86 | self, 87 | is_train=True, 88 | *, 89 | tfm_gens, 90 | image_format, 91 | ): 92 | """ 93 | NOTE: this interface is experimental. 94 | Args: 95 | is_train: for training or inference 96 | augmentations: a list of augmentations or deterministic transforms to apply 97 | tfm_gens: data augmentation 98 | image_format: an image format supported by :func:`detection_utils.read_image`. 
99 | """ 100 | self.tfm_gens = tfm_gens 101 | logging.getLogger(__name__).info( 102 | "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens)) 103 | ) 104 | 105 | self.img_format = image_format 106 | self.is_train = is_train 107 | 108 | @classmethod 109 | def from_config(cls, cfg, is_train=True): 110 | # Build augmentation 111 | tfm_gens = build_transform_gen(cfg, is_train) 112 | 113 | ret = { 114 | "is_train": is_train, 115 | "tfm_gens": tfm_gens, 116 | "image_format": cfg.INPUT.FORMAT, 117 | } 118 | return ret 119 | 120 | def __call__(self, dataset_dict): 121 | """ 122 | Args: 123 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 124 | 125 | Returns: 126 | dict: a format that builtin models in detectron2 accept 127 | """ 128 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 129 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 130 | utils.check_image_size(dataset_dict, image) 131 | 132 | # TODO: get padding mask 133 | # by feeding a "segmentation mask" to the same transforms 134 | padding_mask = np.ones(image.shape[:2]) 135 | 136 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 137 | # the crop transformation has default padding value 0 for segmentation 138 | padding_mask = transforms.apply_segmentation(padding_mask) 139 | padding_mask = ~ padding_mask.astype(bool) 140 | 141 | image_shape = image.shape[:2] # h, w 142 | 143 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 144 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 145 | # Therefore it's important to use torch.Tensor. 146 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 147 | dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask)) 148 | 149 | if not self.is_train: 150 | # USER: Modify this if you want to keep them for some reason. 151 | dataset_dict.pop("annotations", None) 152 | return dataset_dict 153 | 154 | if "annotations" in dataset_dict: 155 | # USER: Modify this if you want to keep them for some reason. 156 | for anno in dataset_dict["annotations"]: 157 | # Let's always keep mask 158 | # if not self.mask_on: 159 | # anno.pop("segmentation", None) 160 | anno.pop("keypoints", None) 161 | 162 | # USER: Implement additional transformations if you have other types of data 163 | annos = [ 164 | utils.transform_instance_annotations(obj, transforms, image_shape) 165 | for obj in dataset_dict.pop("annotations") 166 | if obj.get("iscrowd", 0) == 0 167 | ] 168 | # NOTE: does not support BitMask due to augmentation 169 | # Current BitMask cannot handle empty objects 170 | instances = utils.annotations_to_instances(annos, image_shape) 171 | # After transforms such as cropping are applied, the bounding box may no longer 172 | # tightly bound the object. As an example, imagine a triangle object 173 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 174 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 175 | # the intersection of original bounding box and the cropping box. 
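# For that reason, recompute the boxes from the transformed masks below rather than
# transforming the original annotation boxes.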
176 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 177 | # Need to filter empty instances first (due to augmentation) 178 | instances = utils.filter_empty_instances(instances) 179 | # Generate masks from polygon 180 | h, w = instances.image_size 181 | # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float) 182 | if hasattr(instances, 'gt_masks'): 183 | gt_masks = instances.gt_masks 184 | gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) 185 | instances.gt_masks = gt_masks 186 | dataset_dict["instances"] = instances 187 | 188 | return dataset_dict 189 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py 3 | 4 | import copy 5 | import logging 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data import transforms as T 13 | from detectron2.structures import BitMasks, Boxes, Instances 14 | 15 | __all__ = ["COCOPanopticNewBaselineDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 22 | Returns: 23 | list[Augmentation] 24 | """ 25 | assert is_train, "Only support training augmentation" 26 | image_size = cfg.INPUT.IMAGE_SIZE 27 | min_scale = cfg.INPUT.MIN_SCALE 28 | max_scale = cfg.INPUT.MAX_SCALE 29 | 30 | augmentation = [] 31 | 32 | if cfg.INPUT.RANDOM_FLIP != "none": 33 | augmentation.append( 34 | T.RandomFlip( 35 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 36 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 37 | ) 38 | ) 39 | 40 | augmentation.extend([ 41 | T.ResizeScale( 42 | min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size 43 | ), 44 | T.FixedSizeCrop(crop_size=(image_size, image_size)), 45 | ]) 46 | 47 | return augmentation 48 | 49 | 50 | # This is specifically designed for the COCO dataset. 51 | class COCOPanopticNewBaselineDatasetMapper: 52 | """ 53 | A callable which takes a dataset dict in Detectron2 Dataset format, 54 | and map it into a format used by MaskFormer. 55 | 56 | This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 57 | 58 | The callable currently does the following: 59 | 60 | 1. Read the image from "file_name" 61 | 2. Applies geometric transforms to the image and annotation 62 | 3. Find and applies suitable cropping to the image and annotation 63 | 4. Prepare image and annotation to Tensors 64 | """ 65 | 66 | @configurable 67 | def __init__( 68 | self, 69 | is_train=True, 70 | *, 71 | tfm_gens, 72 | image_format, 73 | ): 74 | """ 75 | NOTE: this interface is experimental. 76 | Args: 77 | is_train: for training or inference 78 | augmentations: a list of augmentations or deterministic transforms to apply 79 | crop_gen: crop augmentation 80 | tfm_gens: data augmentation 81 | image_format: an image format supported by :func:`detection_utils.read_image`. 
82 | """ 83 | self.tfm_gens = tfm_gens 84 | logging.getLogger(__name__).info( 85 | "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( 86 | str(self.tfm_gens) 87 | ) 88 | ) 89 | 90 | self.img_format = image_format 91 | self.is_train = is_train 92 | 93 | @classmethod 94 | def from_config(cls, cfg, is_train=True): 95 | # Build augmentation 96 | tfm_gens = build_transform_gen(cfg, is_train) 97 | 98 | ret = { 99 | "is_train": is_train, 100 | "tfm_gens": tfm_gens, 101 | "image_format": cfg.INPUT.FORMAT, 102 | } 103 | return ret 104 | 105 | def __call__(self, dataset_dict): 106 | """ 107 | Args: 108 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 109 | 110 | Returns: 111 | dict: a format that builtin models in detectron2 accept 112 | """ 113 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 114 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 115 | utils.check_image_size(dataset_dict, image) 116 | 117 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 118 | image_shape = image.shape[:2] # h, w 119 | 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | 125 | if not self.is_train: 126 | # USER: Modify this if you want to keep them for some reason. 127 | dataset_dict.pop("annotations", None) 128 | return dataset_dict 129 | 130 | if "pan_seg_file_name" in dataset_dict: 131 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 132 | segments_info = dataset_dict["segments_info"] 133 | 134 | # apply the same transformation to panoptic segmentation 135 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 136 | 137 | from panopticapi.utils import rgb2id 138 | 139 | pan_seg_gt = rgb2id(pan_seg_gt) 140 | 141 | instances = Instances(image_shape) 142 | classes = [] 143 | masks = [] 144 | for segment_info in segments_info: 145 | class_id = segment_info["category_id"] 146 | if not segment_info["iscrowd"]: 147 | classes.append(class_id) 148 | masks.append(pan_seg_gt == segment_info["id"]) 149 | 150 | classes = np.array(classes) 151 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 152 | if len(masks) == 0: 153 | # Some image does not have annotation (all ignored) 154 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 155 | instances.gt_boxes = Boxes(torch.zeros((0, 4))) 156 | else: 157 | masks = BitMasks( 158 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 159 | ) 160 | instances.gt_masks = masks.tensor 161 | instances.gt_boxes = masks.get_bounding_boxes() 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import pycocotools.mask as mask_util 8 | import torch 9 | from torch.nn import functional as F 10 | 11 | from detectron2.config import configurable 12 | from detectron2.data import detection_utils as utils 13 | from detectron2.data import transforms as T 14 | from detectron2.projects.point_rend import ColorAugSSDTransform 15 | from detectron2.structures import BitMasks, Instances, polygons_to_bitmask 16 | 17 | __all__ = ["MaskFormerInstanceDatasetMapper"] 18 | 19 | 20 | class MaskFormerInstanceDatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by MaskFormer for instance segmentation. 24 | 25 | The callable currently does the following: 26 | 27 | 1. Read the image from "file_name" 28 | 2. Applies geometric transforms to the image and annotation 29 | 3. Find and applies suitable cropping to the image and annotation 30 | 4. Prepare image and annotation to Tensors 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, 36 | is_train=True, 37 | *, 38 | augmentations, 39 | image_format, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | size_divisibility: pad image size to be divisible by this value 49 | """ 50 | self.is_train = is_train 51 | self.tfm_gens = augmentations 52 | self.img_format = image_format 53 | self.size_divisibility = size_divisibility 54 | 55 | logger = logging.getLogger(__name__) 56 | mode = "training" if is_train else "inference" 57 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 58 | 59 | @classmethod 60 | def from_config(cls, cfg, is_train=True): 61 | # Build augmentation 62 | augs = [ 63 | T.ResizeShortestEdge( 64 | cfg.INPUT.MIN_SIZE_TRAIN, 65 | cfg.INPUT.MAX_SIZE_TRAIN, 66 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 67 | ) 68 | ] 69 | if cfg.INPUT.CROP.ENABLED: 70 | augs.append( 71 | T.RandomCrop( 72 | cfg.INPUT.CROP.TYPE, 73 | cfg.INPUT.CROP.SIZE, 74 | ) 75 | ) 76 | if cfg.INPUT.COLOR_AUG_SSD: 77 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 78 | augs.append(T.RandomFlip()) 79 | 80 | ret = { 81 | "is_train": is_train, 82 | "augmentations": augs, 83 | "image_format": cfg.INPUT.FORMAT, 84 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 85 | } 86 | return ret 87 | 88 | def __call__(self, dataset_dict): 89 | """ 90 | Args: 91 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 92 | 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
97 | 98 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 99 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 100 | utils.check_image_size(dataset_dict, image) 101 | 102 | aug_input = T.AugInput(image) 103 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 104 | image = aug_input.image 105 | 106 | # transform instnace masks 107 | assert "annotations" in dataset_dict 108 | for anno in dataset_dict["annotations"]: 109 | anno.pop("keypoints", None) 110 | 111 | annos = [ 112 | utils.transform_instance_annotations(obj, transforms, image.shape[:2]) 113 | for obj in dataset_dict.pop("annotations") 114 | if obj.get("iscrowd", 0) == 0 115 | ] 116 | 117 | if len(annos): 118 | assert "segmentation" in annos[0] 119 | segms = [obj["segmentation"] for obj in annos] 120 | masks = [] 121 | for segm in segms: 122 | if isinstance(segm, list): 123 | # polygon 124 | masks.append(polygons_to_bitmask(segm, *image.shape[:2])) 125 | elif isinstance(segm, dict): 126 | # COCO RLE 127 | masks.append(mask_util.decode(segm)) 128 | elif isinstance(segm, np.ndarray): 129 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 130 | segm.ndim 131 | ) 132 | # mask array 133 | masks.append(segm) 134 | else: 135 | raise ValueError( 136 | "Cannot convert segmentation of type '{}' to BitMasks!" 137 | "Supported types are: polygons as list[list[float] or ndarray]," 138 | " COCO-style RLE as a dict, or a binary segmentation mask " 139 | " in a 2D numpy array of shape HxW.".format(type(segm)) 140 | ) 141 | 142 | # Pad image and segmentation label here! 143 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 144 | masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks] 145 | 146 | classes = [int(obj["category_id"]) for obj in annos] 147 | classes = torch.tensor(classes, dtype=torch.int64) 148 | 149 | if self.size_divisibility > 0: 150 | image_size = (image.shape[-2], image.shape[-1]) 151 | padding_size = [ 152 | 0, 153 | self.size_divisibility - image_size[1], 154 | 0, 155 | self.size_divisibility - image_size[0], 156 | ] 157 | # pad image 158 | image = F.pad(image, padding_size, value=128).contiguous() 159 | # pad mask 160 | masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks] 161 | 162 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 163 | 164 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 165 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 166 | # Therefore it's important to use torch.Tensor. 167 | dataset_dict["image"] = image 168 | 169 | # Prepare per-category binary masks 170 | instances = Instances(image_shape) 171 | instances.gt_classes = classes 172 | if len(masks) == 0: 173 | # Some image does not have annotation (all ignored) 174 | instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1])) 175 | else: 176 | masks = BitMasks(torch.stack(masks)) 177 | instances.gt_masks = masks.tensor 178 | 179 | dataset_dict["instances"] = instances 180 | 181 | return dataset_dict 182 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
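A brief usage note for the MaskFormerInstanceDatasetMapper just shown: like the other mappers in this directory, it is built from a config via detectron2's @configurable mechanism and passed to the training data loader. The sketch below is a minimal illustration under the assumption that a detectron2 config cfg containing the INPUT keys referenced above has already been loaded; it is not taken from this repository's train_net.py.

from detectron2.data import build_detection_train_loader

from mask2former.data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
)

# `cfg` is assumed to be an already-populated detectron2 CfgNode (not shown here).
mapper = MaskFormerInstanceDatasetMapper(cfg, is_train=True)
train_loader = build_detection_train_loader(cfg, mapper=mapper)

for batch in train_loader:
    # Each element carries "image" (a C x H x W tensor) and "instances" holding
    # gt_classes and gt_masks, exactly as assembled in __call__ above.
    break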
2 | 3 | import copy 4 | 5 | import numpy as np 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | from detectron2.config import configurable 10 | from detectron2.data import detection_utils as utils 11 | from detectron2.data import transforms as T 12 | from detectron2.structures import BitMasks, Instances 13 | 14 | from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper 15 | 16 | __all__ = ["MaskFormerPanopticDatasetMapper"] 17 | 18 | 19 | class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper): 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by MaskFormer for panoptic segmentation. 23 | 24 | The callable currently does the following: 25 | 26 | 1. Read the image from "file_name" 27 | 2. Applies geometric transforms to the image and annotation 28 | 3. Find and applies suitable cropping to the image and annotation 29 | 4. Prepare image and annotation to Tensors 30 | """ 31 | 32 | @configurable 33 | def __init__( 34 | self, 35 | is_train=True, 36 | *, 37 | augmentations, 38 | image_format, 39 | ignore_label, 40 | size_divisibility, 41 | ): 42 | """ 43 | NOTE: this interface is experimental. 44 | Args: 45 | is_train: for training or inference 46 | augmentations: a list of augmentations or deterministic transforms to apply 47 | image_format: an image format supported by :func:`detection_utils.read_image`. 48 | ignore_label: the label that is ignored to evaluation 49 | size_divisibility: pad image size to be divisible by this value 50 | """ 51 | super().__init__( 52 | is_train, 53 | augmentations=augmentations, 54 | image_format=image_format, 55 | ignore_label=ignore_label, 56 | size_divisibility=size_divisibility, 57 | ) 58 | 59 | def __call__(self, dataset_dict): 60 | """ 61 | Args: 62 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 63 | 64 | Returns: 65 | dict: a format that builtin models in detectron2 accept 66 | """ 67 | assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!" 
68 | 69 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 70 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 71 | utils.check_image_size(dataset_dict, image) 72 | 73 | # semantic segmentation 74 | if "sem_seg_file_name" in dataset_dict: 75 | # PyTorch transformation not implemented for uint16, so converting it to double first 76 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 77 | else: 78 | sem_seg_gt = None 79 | 80 | # panoptic segmentation 81 | if "pan_seg_file_name" in dataset_dict: 82 | pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") 83 | segments_info = dataset_dict["segments_info"] 84 | else: 85 | pan_seg_gt = None 86 | segments_info = None 87 | 88 | if pan_seg_gt is None: 89 | raise ValueError( 90 | "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( 91 | dataset_dict["file_name"] 92 | ) 93 | ) 94 | 95 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 96 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 97 | image = aug_input.image 98 | if sem_seg_gt is not None: 99 | sem_seg_gt = aug_input.sem_seg 100 | 101 | # apply the same transformation to panoptic segmentation 102 | pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) 103 | 104 | from panopticapi.utils import rgb2id 105 | 106 | pan_seg_gt = rgb2id(pan_seg_gt) 107 | 108 | # Pad image and segmentation label here! 109 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 110 | if sem_seg_gt is not None: 111 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 112 | pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) 113 | 114 | if self.size_divisibility > 0: 115 | image_size = (image.shape[-2], image.shape[-1]) 116 | padding_size = [ 117 | 0, 118 | self.size_divisibility - image_size[1], 119 | 0, 120 | self.size_divisibility - image_size[0], 121 | ] 122 | image = F.pad(image, padding_size, value=128).contiguous() 123 | if sem_seg_gt is not None: 124 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 125 | pan_seg_gt = F.pad( 126 | pan_seg_gt, padding_size, value=0 127 | ).contiguous() # 0 is the VOID panoptic label 128 | 129 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 130 | 131 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 132 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 133 | # Therefore it's important to use torch.Tensor. 
134 | dataset_dict["image"] = image 135 | if sem_seg_gt is not None: 136 | dataset_dict["sem_seg"] = sem_seg_gt.long() 137 | 138 | if "annotations" in dataset_dict: 139 | raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") 140 | 141 | # Prepare per-category binary masks 142 | pan_seg_gt = pan_seg_gt.numpy() 143 | instances = Instances(image_shape) 144 | classes = [] 145 | masks = [] 146 | for segment_info in segments_info: 147 | class_id = segment_info["category_id"] 148 | if not segment_info["iscrowd"]: 149 | classes.append(class_id) 150 | masks.append(pan_seg_gt == segment_info["id"]) 151 | 152 | classes = np.array(classes) 153 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 154 | if len(masks) == 0: 155 | # Some image does not have annotation (all ignored) 156 | instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) 157 | else: 158 | masks = BitMasks( 159 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 160 | ) 161 | instances.gt_masks = masks.tensor 162 | 163 | dataset_dict["instances"] = instances 164 | 165 | return dataset_dict 166 | -------------------------------------------------------------------------------- /mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import copy 4 | import logging 5 | 6 | import numpy as np 7 | import torch 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.data import MetadataCatalog 12 | from detectron2.data import detection_utils as utils 13 | from detectron2.data import transforms as T 14 | from detectron2.projects.point_rend import ColorAugSSDTransform 15 | from detectron2.structures import BitMasks, Instances 16 | 17 | __all__ = ["MaskFormerSemanticDatasetMapper"] 18 | 19 | 20 | class MaskFormerSemanticDatasetMapper: 21 | """ 22 | A callable which takes a dataset dict in Detectron2 Dataset format, 23 | and map it into a format used by MaskFormer for semantic segmentation. 24 | 25 | The callable currently does the following: 26 | 27 | 1. Read the image from "file_name" 28 | 2. Applies geometric transforms to the image and annotation 29 | 3. Find and applies suitable cropping to the image and annotation 30 | 4. Prepare image and annotation to Tensors 31 | """ 32 | 33 | @configurable 34 | def __init__( 35 | self, 36 | is_train=True, 37 | *, 38 | augmentations, 39 | image_format, 40 | ignore_label, 41 | size_divisibility, 42 | ): 43 | """ 44 | NOTE: this interface is experimental. 45 | Args: 46 | is_train: for training or inference 47 | augmentations: a list of augmentations or deterministic transforms to apply 48 | image_format: an image format supported by :func:`detection_utils.read_image`. 
49 | ignore_label: the label that is ignored to evaluation 50 | size_divisibility: pad image size to be divisible by this value 51 | """ 52 | self.is_train = is_train 53 | self.tfm_gens = augmentations 54 | self.img_format = image_format 55 | self.ignore_label = ignore_label 56 | self.size_divisibility = size_divisibility 57 | 58 | logger = logging.getLogger(__name__) 59 | mode = "training" if is_train else "inference" 60 | logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") 61 | 62 | @classmethod 63 | def from_config(cls, cfg, is_train=True): 64 | # Build augmentation 65 | augs = [ 66 | T.ResizeShortestEdge( 67 | cfg.INPUT.MIN_SIZE_TRAIN, 68 | cfg.INPUT.MAX_SIZE_TRAIN, 69 | cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, 70 | ) 71 | ] 72 | if cfg.INPUT.CROP.ENABLED: 73 | augs.append( 74 | T.RandomCrop_CategoryAreaConstraint( 75 | cfg.INPUT.CROP.TYPE, 76 | cfg.INPUT.CROP.SIZE, 77 | cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, 78 | cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 79 | ) 80 | ) 81 | if cfg.INPUT.COLOR_AUG_SSD: 82 | augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) 83 | augs.append(T.RandomFlip()) 84 | 85 | # Assume always applies to the training set. 86 | dataset_names = cfg.DATASETS.TRAIN 87 | meta = MetadataCatalog.get(dataset_names[0]) 88 | ignore_label = meta.ignore_label 89 | 90 | ret = { 91 | "is_train": is_train, 92 | "augmentations": augs, 93 | "image_format": cfg.INPUT.FORMAT, 94 | "ignore_label": ignore_label, 95 | "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, 96 | } 97 | return ret 98 | 99 | def __call__(self, dataset_dict): 100 | """ 101 | Args: 102 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 103 | 104 | Returns: 105 | dict: a format that builtin models in detectron2 accept 106 | """ 107 | assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 108 | 109 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 110 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 111 | utils.check_image_size(dataset_dict, image) 112 | 113 | if "sem_seg_file_name" in dataset_dict: 114 | # PyTorch transformation not implemented for uint16, so converting it to double first 115 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") 116 | else: 117 | sem_seg_gt = None 118 | 119 | if sem_seg_gt is None: 120 | raise ValueError( 121 | "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( 122 | dataset_dict["file_name"] 123 | ) 124 | ) 125 | 126 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 127 | aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) 128 | image = aug_input.image 129 | sem_seg_gt = aug_input.sem_seg 130 | 131 | # Pad image and segmentation label here! 
132 | image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 133 | if sem_seg_gt is not None: 134 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 135 | 136 | if self.size_divisibility > 0: 137 | image_size = (image.shape[-2], image.shape[-1]) 138 | padding_size = [ 139 | 0, 140 | self.size_divisibility - image_size[1], 141 | 0, 142 | self.size_divisibility - image_size[0], 143 | ] 144 | image = F.pad(image, padding_size, value=128).contiguous() 145 | if sem_seg_gt is not None: 146 | sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() 147 | 148 | image_shape = (image.shape[-2], image.shape[-1]) # h, w 149 | 150 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 151 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 152 | # Therefore it's important to use torch.Tensor. 153 | dataset_dict["image"] = image 154 | 155 | if sem_seg_gt is not None: 156 | dataset_dict["sem_seg"] = sem_seg_gt.long() 157 | 158 | if "annotations" in dataset_dict: 159 | raise ValueError("Semantic segmentation dataset should not have 'annotations'.") 160 | 161 | # Prepare per-category binary masks 162 | if sem_seg_gt is not None: 163 | sem_seg_gt = sem_seg_gt.numpy() 164 | instances = Instances(image_shape) 165 | classes = np.unique(sem_seg_gt) 166 | # remove ignored region 167 | classes = classes[classes != self.ignore_label] 168 | instances.gt_classes = torch.tensor(classes, dtype=torch.int64) 169 | 170 | masks = [] 171 | for class_id in classes: 172 | masks.append(sem_seg_gt == class_id) 173 | 174 | if len(masks) == 0: 175 | # Some image does not have annotation (all ignored) 176 | instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) 177 | else: 178 | masks = BitMasks( 179 | torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) 180 | ) 181 | instances.gt_masks = masks.tensor 182 | 183 | dataset_dict["instances"] = instances 184 | 185 | return dataset_dict 186 | -------------------------------------------------------------------------------- /mask2former/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from . import register_coco_panoptic_annos_semseg 4 | -------------------------------------------------------------------------------- /mask2former/data/datasets/register_coco_panoptic_annos_semseg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import json 4 | import os 5 | 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | _PREDEFINED_SPLITS_COCO_PANOPTIC = { 12 | "coco_2017_train_panoptic": ( 13 | # This is the original panoptic annotation directory 14 | "coco/panoptic_train2017", 15 | "coco/annotations/panoptic_train2017.json", 16 | # This directory contains semantic annotations that are 17 | # converted from panoptic annotations. 18 | # It is used by PanopticFPN. 19 | # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py 20 | # to create these directories. 
21 | "coco/panoptic_semseg_train2017", 22 | ), 23 | "coco_2017_val_panoptic": ( 24 | "coco/panoptic_val2017", 25 | "coco/annotations/panoptic_val2017.json", 26 | "coco/panoptic_semseg_val2017", 27 | ), 28 | } 29 | 30 | 31 | def get_metadata(): 32 | meta = {} 33 | # The following metadata maps contiguous id from [0, #thing categories + 34 | # #stuff categories) to their names and colors. We have to replica of the 35 | # same name and color under "thing_*" and "stuff_*" because the current 36 | # visualization function in D2 handles thing and class classes differently 37 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 38 | # enable reusing existing visualization functions. 39 | thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] 40 | thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] 41 | stuff_classes = [k["name"] for k in COCO_CATEGORIES] 42 | stuff_colors = [k["color"] for k in COCO_CATEGORIES] 43 | 44 | meta["thing_classes"] = thing_classes 45 | meta["thing_colors"] = thing_colors 46 | meta["stuff_classes"] = stuff_classes 47 | meta["stuff_colors"] = stuff_colors 48 | 49 | # Convert category id for training: 50 | # category id: like semantic segmentation, it is the class id for each 51 | # pixel. Since there are some classes not used in evaluation, the category 52 | # id is not always contiguous and thus we have two set of category ids: 53 | # - original category id: category id in the original dataset, mainly 54 | # used for evaluation. 55 | # - contiguous category id: [0, #classes), in order to train the linear 56 | # softmax classifier. 57 | thing_dataset_id_to_contiguous_id = {} 58 | stuff_dataset_id_to_contiguous_id = {} 59 | 60 | for i, cat in enumerate(COCO_CATEGORIES): 61 | if cat["isthing"]: 62 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 63 | # else: 64 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 65 | 66 | # in order to use sem_seg evaluator 67 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 68 | 69 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 70 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 71 | 72 | return meta 73 | 74 | 75 | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 76 | """ 77 | Args: 78 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 79 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 80 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 81 | Returns: 82 | list[dict]: a list of dicts in Detectron2 standard format. (See 83 | `Using Custom Datasets `_ ) 84 | """ 85 | 86 | def _convert_category_id(segment_info, meta): 87 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 88 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 89 | segment_info["category_id"] 90 | ] 91 | segment_info["isthing"] = True 92 | else: 93 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 94 | segment_info["category_id"] 95 | ] 96 | segment_info["isthing"] = False 97 | return segment_info 98 | 99 | with PathManager.open(json_file) as f: 100 | json_info = json.load(f) 101 | 102 | ret = [] 103 | for ann in json_info["annotations"]: 104 | image_id = int(ann["image_id"]) 105 | # TODO: currently we assume image and label has the same filename but 106 | # different extension, and images have extension ".jpg" for COCO. 
Need 107 | # to make image extension a user-provided argument if we extend this 108 | # function to support other COCO-like datasets. 109 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 110 | label_file = os.path.join(gt_dir, ann["file_name"]) 111 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 112 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 113 | ret.append( 114 | { 115 | "file_name": image_file, 116 | "image_id": image_id, 117 | "pan_seg_file_name": label_file, 118 | "sem_seg_file_name": sem_label_file, 119 | "segments_info": segments_info, 120 | } 121 | ) 122 | assert len(ret), f"No images found in {image_dir}!" 123 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 124 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 125 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 126 | return ret 127 | 128 | 129 | def register_coco_panoptic_annos_sem_seg( 130 | name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json 131 | ): 132 | panoptic_name = name 133 | delattr(MetadataCatalog.get(panoptic_name), "thing_classes") 134 | delattr(MetadataCatalog.get(panoptic_name), "thing_colors") 135 | MetadataCatalog.get(panoptic_name).set( 136 | thing_classes=metadata["thing_classes"], 137 | thing_colors=metadata["thing_colors"], 138 | # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], 139 | ) 140 | 141 | # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" 142 | semantic_name = name + "_with_sem_seg" 143 | DatasetCatalog.register( 144 | semantic_name, 145 | lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata), 146 | ) 147 | MetadataCatalog.get(semantic_name).set( 148 | sem_seg_root=sem_seg_root, 149 | panoptic_root=panoptic_root, 150 | image_root=image_root, 151 | panoptic_json=panoptic_json, 152 | json_file=instances_json, 153 | evaluator_type="coco_panoptic_seg", 154 | ignore_label=255, 155 | label_divisor=1000, 156 | **metadata, 157 | ) 158 | 159 | 160 | def register_all_coco_panoptic_annos_sem_seg(root): 161 | for ( 162 | prefix, 163 | (panoptic_root, panoptic_json, semantic_root), 164 | ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): 165 | prefix_instances = prefix[: -len("_panoptic")] 166 | instances_meta = MetadataCatalog.get(prefix_instances) 167 | image_root, instances_json = instances_meta.image_root, instances_meta.json_file 168 | 169 | register_coco_panoptic_annos_sem_seg( 170 | prefix, 171 | get_metadata(), 172 | image_root, 173 | os.path.join(root, panoptic_root), 174 | os.path.join(root, panoptic_json), 175 | os.path.join(root, semantic_root), 176 | instances_json, 177 | ) 178 | 179 | 180 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 181 | register_all_coco_panoptic_annos_sem_seg(_root) 182 | -------------------------------------------------------------------------------- /mask2former/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/evaluation/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
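A short consumption sketch for register_coco_panoptic_annos_semseg.py above: importing the module (which mask2former.data.datasets does automatically) registers the combined panoptic-plus-semantic splits under names ending in _with_sem_seg, after which they can be fetched from detectron2's catalogs. The snippet below is illustrative only and assumes the COCO panoptic files are actually present under $DETECTRON2_DATASETS (default datasets/).

from detectron2.data import DatasetCatalog, MetadataCatalog

# Importing the package runs register_all_coco_panoptic_annos_sem_seg at module scope above.
import mask2former.data.datasets  # noqa: F401

name = "coco_2017_val_panoptic_with_sem_seg"
records = DatasetCatalog.get(name)  # list of dicts built by load_coco_panoptic_json
meta = MetadataCatalog.get(name)    # carries the metadata set above, e.g. label_divisor=1000
print(len(records), records[0]["pan_seg_file_name"], meta.ignore_label)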
2 | 3 | import itertools 4 | import json 5 | import os 6 | 7 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 8 | from detectron2.utils.file_io import PathManager 9 | 10 | 11 | # modified from COCOEvaluator for instance segmetnat 12 | class InstanceSegEvaluator(COCOEvaluator): 13 | """ 14 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 15 | for keypoint detection outputs using COCO's metrics. 16 | See http://cocodataset.org/#detection-eval and 17 | http://cocodataset.org/#keypoints-eval to understand its metrics. 18 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 19 | the metric cannot be computed (e.g. due to no predictions made). 20 | 21 | In addition to COCO, this evaluator is able to support any bounding box detection, 22 | instance segmentation, or keypoint detection dataset. 23 | """ 24 | 25 | def _eval_predictions(self, predictions, img_ids=None): 26 | """ 27 | Evaluate predictions. Fill self._results with the metrics of the tasks. 28 | """ 29 | self._logger.info("Preparing results for COCO format ...") 30 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 31 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 32 | 33 | # unmap the category ids for COCO 34 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 35 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 36 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 37 | # num_classes = len(all_contiguous_ids) 38 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 39 | 40 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 41 | for result in coco_results: 42 | category_id = result["category_id"] 43 | # assert category_id < num_classes, ( 44 | # f"A prediction has class={category_id}, " 45 | # f"but the dataset only has {num_classes} classes and " 46 | # f"predicted class id should be in [0, {num_classes - 1}]." 47 | # ) 48 | assert category_id in reverse_id_mapping, ( 49 | f"A prediction has class={category_id}, " 50 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 51 | ) 52 | result["category_id"] = reverse_id_mapping[category_id] 53 | 54 | if self._output_dir: 55 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 56 | self._logger.info("Saving results to {}".format(file_path)) 57 | with PathManager.open(file_path, "w") as f: 58 | f.write(json.dumps(coco_results)) 59 | f.flush() 60 | 61 | if not self._do_evaluation: 62 | self._logger.info("Annotations are not available for evaluation.") 63 | return 64 | 65 | self._logger.info( 66 | "Evaluating predictions with {} COCO API...".format( 67 | "unofficial" if self._use_fast_impl else "official" 68 | ) 69 | ) 70 | for task in sorted(tasks): 71 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 
72 | coco_eval = ( 73 | _evaluate_predictions_on_coco( 74 | self._coco_api, 75 | coco_results, 76 | task, 77 | kpt_oks_sigmas=self._kpt_oks_sigmas, 78 | use_fast_impl=self._use_fast_impl, 79 | img_ids=img_ids, 80 | max_dets_per_image=self._max_dets_per_image, 81 | ) 82 | if len(coco_results) > 0 83 | else None # cocoapi does not handle empty results very well 84 | ) 85 | 86 | res = self._derive_coco_results( 87 | coco_eval, task, class_names=self._metadata.get("thing_classes") 88 | ) 89 | self._results[task] = res 90 | -------------------------------------------------------------------------------- /mask2former/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Adapted for AutoFocusFormer by Ziwen 2023 3 | 4 | from .backbone.aff import AutoFocusFormer 5 | 6 | from .pixel_decoder.msdeformattn_pc import MSDeformAttnPixelDecoder 7 | from .meta_arch.mask_former_head import MaskFormerHead 8 | -------------------------------------------------------------------------------- /mask2former/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from .clusten import CLUSTENQKFunction, CLUSTENAVFunction, CLUSTENWFFunction, WEIGHTEDGATHERFunction, MSDETRPCFunction 7 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/clusten.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from torch.autograd import Function 7 | 8 | try: 9 | import clustenqk_cuda 10 | import clustenav_cuda 11 | import clustenwf_cuda 12 | import weighted_gather_cuda 13 | import msdetrpc_cuda 14 | except ImportError: 15 | raise RuntimeError("Could not load CLUSTEN CUDA extension. 
" + 16 | "Please make sure your device has CUDA, the CUDA toolkit for PyTorch is installed, and that you've compiled CLUSTEN correctly.") 17 | 18 | 19 | class CLUSTENQKFunction(Function): 20 | """ 21 | query times key function 22 | """ 23 | @staticmethod 24 | def forward(ctx, query, key, nbhd_idx): 25 | query = query.contiguous() 26 | key = key.contiguous() 27 | if key.dtype != query.dtype: 28 | key = key.to(query.dtype) 29 | nbhd_idx = nbhd_idx.contiguous() 30 | attn = clustenqk_cuda.forward( 31 | query, 32 | key.permute(0, 1, 3, 2).contiguous(), 33 | nbhd_idx) 34 | ctx.save_for_backward(query, key, nbhd_idx) 35 | return attn 36 | 37 | @staticmethod 38 | def backward(ctx, grad_attn): 39 | outputs = clustenqk_cuda.backward( 40 | grad_attn.contiguous(), *ctx.saved_tensors) 41 | d_query, d_key = outputs 42 | return d_query, d_key, None 43 | 44 | 45 | class CLUSTENAVFunction(Function): 46 | """ 47 | attention times value function 48 | """ 49 | @staticmethod 50 | def forward(ctx, attn, v, nbhd_idx): 51 | attn = attn.contiguous() 52 | v = v.contiguous() 53 | nbhd_idx = nbhd_idx.contiguous() 54 | if attn.dtype != v.dtype: 55 | v = v.to(attn.dtype) 56 | feat = clustenav_cuda.forward( 57 | attn, 58 | v, 59 | nbhd_idx) 60 | ctx.save_for_backward(attn, v, nbhd_idx) 61 | return feat 62 | 63 | @staticmethod 64 | def backward(ctx, grad_feat): 65 | outputs = clustenav_cuda.backward( 66 | grad_feat.contiguous(), *ctx.saved_tensors) 67 | d_attn, d_v = outputs 68 | return d_attn, d_v, None 69 | 70 | 71 | class CLUSTENWFFunction(Function): 72 | """ 73 | weight times feature function 74 | """ 75 | @staticmethod 76 | def forward(ctx, weights, feat, nbhd_idx): 77 | weights = weights.contiguous() 78 | feat = feat.contiguous() 79 | nbhd_idx = nbhd_idx.contiguous() 80 | if feat.dtype != weights.dtype: 81 | feat = feat.to(weights.dtype) 82 | feat_new = clustenwf_cuda.forward( 83 | weights, 84 | feat, 85 | nbhd_idx) 86 | ctx.save_for_backward(weights, feat, nbhd_idx) 87 | return feat_new 88 | 89 | @staticmethod 90 | def backward(ctx, grad_feat_new): 91 | outputs = clustenwf_cuda.backward( 92 | grad_feat_new.contiguous(), *ctx.saved_tensors) 93 | d_weights, d_feat = outputs 94 | return d_weights, d_feat, None 95 | 96 | 97 | class WEIGHTEDGATHERFunction(Function): 98 | """ 99 | weighted gather function 100 | """ 101 | @staticmethod 102 | def forward(ctx, nbhd_idx, weights, feat): 103 | nbhd_idx = nbhd_idx.contiguous() 104 | weights = weights.contiguous() 105 | feat = feat.contiguous() 106 | if feat.dtype != weights.dtype: 107 | weights = weights.to(feat.dtype) 108 | feat_new = weighted_gather_cuda.forward( 109 | nbhd_idx, 110 | weights, 111 | feat) 112 | ctx.save_for_backward(nbhd_idx, weights, feat) 113 | return feat_new 114 | 115 | @staticmethod 116 | def backward(ctx, grad_feat_new): 117 | outputs = weighted_gather_cuda.backward( 118 | grad_feat_new.contiguous(), *ctx.saved_tensors) 119 | d_weights, d_feat = outputs 120 | return None, d_weights, d_feat 121 | 122 | 123 | class MSDETRPCFunction(Function): 124 | """ 125 | deformable multi scale detr point cloud function 126 | """ 127 | @staticmethod 128 | def forward(ctx, nn_idx, nn_weight, attn, val): 129 | nn_idx = nn_idx.contiguous() 130 | nn_weight = nn_weight.contiguous() 131 | attn = attn.contiguous() 132 | val = val.contiguous() 133 | feat = msdetrpc_cuda.forward( 134 | nn_idx, 135 | nn_weight, 136 | attn, 137 | val) 138 | ctx.save_for_backward(nn_idx, nn_weight, attn, val) 139 | return feat 140 | 141 | @staticmethod 142 | def backward(ctx, grad_feat): 143 | 
        outputs = msdetrpc_cuda.backward(
144 |             grad_feat.contiguous(), *ctx.saved_tensors)
145 |         d_weight, d_attn, d_val = outputs
146 |         return None, d_weight, d_attn, d_val
147 | 
-------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenav_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * For licensing see accompanying LICENSE file.
3 |  * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 |  */
5 | 
6 | #include <torch/extension.h>
7 | #include <vector>
8 | 
9 | torch::Tensor clusten_av_cuda_forward(
10 |     const torch::Tensor &attn,      // b x h x n x m
11 |     const torch::Tensor &v,         // b x h x n x c
12 |     const torch::Tensor &nbhd_idx); // b x n x m
13 | 
14 | std::vector<torch::Tensor> clusten_av_cuda_backward(
15 |     const torch::Tensor &d_feat,
16 |     const torch::Tensor &attn,
17 |     const torch::Tensor &v,
18 |     const torch::Tensor &nbhd_idx);
19 | 
20 | // C++ interface
21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
24 | 
25 | torch::Tensor clusten_av_forward(
26 |     const torch::Tensor &attn,
27 |     const torch::Tensor &v,
28 |     const torch::Tensor &nbhd_idx) {
29 |     CHECK_INPUT(attn);
30 |     CHECK_INPUT(v);
31 |     CHECK_INPUT(nbhd_idx);
32 |     return clusten_av_cuda_forward(attn, v, nbhd_idx);
33 | }
34 | 
35 | std::vector<torch::Tensor> clusten_av_backward(
36 |     const torch::Tensor &d_feat,
37 |     const torch::Tensor &attn,
38 |     const torch::Tensor &v,
39 |     const torch::Tensor &nbhd_idx) {
40 |     CHECK_INPUT(d_feat);
41 |     CHECK_INPUT(attn);
42 |     CHECK_INPUT(v);
43 |     CHECK_INPUT(nbhd_idx);
44 |     return clusten_av_cuda_backward(d_feat, attn, v, nbhd_idx);
45 | }
46 | 
47 | 
48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
49 |     m.def("forward", &clusten_av_forward, "CLUSTENAV forward (CUDA)");
50 |     m.def("backward", &clusten_av_backward, "CLUSTENAV backward (CUDA)");
51 | }
52 | 
-------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenqk_cuda.cpp: --------------------------------------------------------------------------------
1 | /*
2 |  * For licensing see accompanying LICENSE file.
3 |  * Copyright (C) 2023 Apple Inc. All Rights Reserved.
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor clusten_qk_cuda_forward( 10 | const torch::Tensor &query, // b x h x n x c 11 | const torch::Tensor &key, // b x h x n x c 12 | const torch::Tensor &nbhd_idx); // b x n x m 13 | 14 | std::vector clusten_qk_cuda_backward( 15 | const torch::Tensor &d_attn, 16 | const torch::Tensor &query, 17 | const torch::Tensor &key, 18 | const torch::Tensor &nbhd_idx); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor clusten_qk_forward( 26 | const torch::Tensor &query, 27 | const torch::Tensor &key, 28 | const torch::Tensor &nbhd_idx) { 29 | CHECK_INPUT(query); 30 | CHECK_INPUT(key); 31 | CHECK_INPUT(nbhd_idx); 32 | return clusten_qk_cuda_forward(query, key, nbhd_idx); 33 | } 34 | 35 | std::vector clusten_qk_backward( 36 | const torch::Tensor &d_attn, 37 | const torch::Tensor &query, 38 | const torch::Tensor &key, 39 | const torch::Tensor &nbhd_idx) { 40 | CHECK_INPUT(d_attn); 41 | CHECK_INPUT(query); 42 | CHECK_INPUT(key); 43 | CHECK_INPUT(nbhd_idx); 44 | return clusten_qk_cuda_backward(d_attn, query, key, nbhd_idx); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &clusten_qk_forward, "CLUSTENQK forward (CUDA)"); 50 | m.def("backward", &clusten_qk_backward, "CLUSTENQK backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenqk_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define CUDA_NUM_THREADS 1024 17 | 18 | template 19 | __global__ void clusten_qk_cuda_forward_kernel( 20 | const torch::PackedTensorAccessor32 query, // b x h x n x c 21 | const torch::PackedTensorAccessor32 key, // b x h x c x n (reordered by cluster) 22 | const torch::PackedTensorAccessor32 nbhd_idx, // b x n x m 23 | torch::PackedTensorAccessor32 attn, // b x h x n x m 24 | const int length, // n 25 | const int batch_size, // b 26 | const int heads, // h 27 | const int nbhd_size, // m 28 | const int dim) { // c 29 | 30 | const int z = blockIdx.z * blockDim.z + threadIdx.z; 31 | if (z < batch_size * heads){ 32 | const int i = blockIdx.y * blockDim.y + threadIdx.y; 33 | if (i < length){ 34 | const int ni = blockIdx.x * blockDim.x + threadIdx.x; 35 | if (ni < nbhd_size){ 36 | const int b = z / heads; 37 | const int h = z - b * heads; 38 | int64_t nbi = nbhd_idx[b][i][ni]; 39 | // calculate q@k 40 | scalar_t updt = scalar_t(0); 41 | #pragma unroll 42 | for (unsigned int c=0; c < dim; ++c) { 43 | updt += query[b][h][i][c] * key[b][h][c][nbi]; 44 | } 45 | attn[b][h][i][ni] = updt; 46 | } 47 | } 48 | } 49 | } 50 | 51 | 52 | torch::Tensor clusten_qk_cuda_forward( 53 | const torch::Tensor &query, 54 | const torch::Tensor &key, 55 | const torch::Tensor &nbhd_idx) { 56 | 57 | int64_t batch_size = query.size(0); 58 | int64_t heads = query.size(1); 59 | int64_t length = query.size(2); 60 | int64_t dim = query.size(3); 61 | int64_t nbhd_size = nbhd_idx.size(2); 62 | int zsize = batch_size * heads; 63 | 64 | int NBHDTHREADS = min(int64_t(CUDA_NUM_THREADS), nbhd_size); 65 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / NBHDTHREADS), length); 66 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * NBHDTHREADS)); 67 | 68 | auto attn = torch::zeros( 69 | {batch_size, heads, length, nbhd_size}, query.options()); 70 | 71 | const auto stream = c10::cuda::getCurrentCUDAStream(); 72 | const dim3 blocks( 73 | (dim + NBHDTHREADS - 1) / NBHDTHREADS, 74 | (length + TOKENTHREADS - 1) / TOKENTHREADS, 75 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS); 76 | const dim3 threads(NBHDTHREADS, TOKENTHREADS, BATCHTHREADS); 77 | 78 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_forward", ([&] { 79 | const auto query_a = query.packed_accessor32(); 80 | const auto key_a = key.packed_accessor32(); 81 | const auto nbhd_idx_a = nbhd_idx.packed_accessor32(); 82 | auto attn_a = attn.packed_accessor32(); 83 | 84 | clusten_qk_cuda_forward_kernel<<>>( 85 | query_a, key_a, nbhd_idx_a, attn_a, 86 | length, batch_size, heads, nbhd_size, dim); 87 | })); 88 | return attn; 89 | } 90 | 91 | template 92 | __global__ void clusten_qk_cuda_backward_kernel( 93 | const torch::PackedTensorAccessor32 d_attn, 94 | const torch::PackedTensorAccessor32 query, 95 | const torch::PackedTensorAccessor32 key, 96 | const torch::PackedTensorAccessor32 nbhd_idx, 97 | torch::PackedTensorAccessor32 d_query, 98 | torch::PackedTensorAccessor32 d_key, 99 | const int length, 100 | const int batch_size, 101 | const int heads, 102 | const int nbhd_size, 103 | const int dim, 104 | const size_t d_key_numel) { 105 | 106 | const int z = blockIdx.z * blockDim.z + threadIdx.z; 107 | if (z < batch_size * heads){ 108 | const int i = blockIdx.y * blockDim.y + threadIdx.y; 109 | if (i < length){ 110 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 111 | if (c < dim){ 112 | const int b = z / heads; 113 
| const int h = z - b * heads; 114 | size_t index; 115 | scalar_t dq_update = scalar_t(0); 116 | scalar_t d_attn_tmp; 117 | #pragma unroll 118 | for (unsigned int ni=0; ni < nbhd_size; ++ni) { 119 | const int64_t nbi = nbhd_idx[b][i][ni]; 120 | // calculate d_query = key * d_att 121 | // calculate d_key = query * d_att 122 | d_attn_tmp = d_attn[b][h][i][ni]; 123 | dq_update += key[b][h][nbi][c] * d_attn_tmp; 124 | index = b*d_key.stride(0) + h*d_key.stride(1) + nbi*d_key.stride(2) + c; 125 | at::native::fastAtomicAdd(d_key.data(), index, d_key_numel, query[b][h][i][c] * d_attn_tmp, true); 126 | //atomicAdd(&(d_key[b][h][nbi][c]), query[b][h][i][c] * d_attn_tmp); // avoid race condition 127 | } 128 | d_query[b][h][i][c] = dq_update; 129 | } 130 | } 131 | } 132 | } 133 | 134 | std::vector clusten_qk_cuda_backward( 135 | const torch::Tensor &d_attn, 136 | const torch::Tensor &query, 137 | const torch::Tensor &key, 138 | const torch::Tensor &nbhd_idx) { 139 | 140 | int64_t batch_size = query.size(0); 141 | int64_t heads = query.size(1); 142 | int64_t length = query.size(2); 143 | int64_t dim = query.size(3); 144 | int64_t nbhd_size = nbhd_idx.size(2); 145 | int zsize = batch_size * heads; 146 | 147 | int CHANNELTHREADS = min(int64_t(CUDA_NUM_THREADS), dim); 148 | int TOKENTHREADS = min(int64_t(CUDA_NUM_THREADS / CHANNELTHREADS), length); 149 | int BATCHTHREADS = max(1, CUDA_NUM_THREADS / (TOKENTHREADS * CHANNELTHREADS)); 150 | 151 | auto d_query = torch::zeros_like(query); 152 | auto d_key = torch::zeros_like(key); 153 | 154 | const auto stream = c10::cuda::getCurrentCUDAStream(); 155 | 156 | const dim3 blocks( 157 | (dim + CHANNELTHREADS - 1) / CHANNELTHREADS, 158 | (length + TOKENTHREADS - 1) / TOKENTHREADS, 159 | (zsize + BATCHTHREADS - 1) / BATCHTHREADS); 160 | 161 | const dim3 threads(CHANNELTHREADS, TOKENTHREADS, BATCHTHREADS); 162 | 163 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(query.scalar_type(), "clusten_qk_cuda_backward", ([&] { 164 | const auto d_attn_a = d_attn.packed_accessor32(); 165 | const auto query_a = query.packed_accessor32(); 166 | const auto key_a = key.packed_accessor32(); 167 | const auto nbhd_idx_a = nbhd_idx.packed_accessor32(); 168 | auto d_query_a = d_query.packed_accessor32(); 169 | auto d_key_a = d_key.packed_accessor32(); 170 | 171 | const size_t d_key_numel = d_key.numel(); 172 | clusten_qk_cuda_backward_kernel<<>>( 173 | d_attn_a, query_a, key_a, nbhd_idx_a, d_query_a, d_key_a, 174 | length, batch_size, heads, nbhd_size, dim, d_key_numel); 175 | })); 176 | 177 | return {d_query, d_key}; 178 | } 179 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/clustenwf_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor clusten_wf_cuda_forward( 10 | const torch::Tensor &weights, // b x n_ x m x ic 11 | const torch::Tensor &feat, // b x n x c 12 | const torch::Tensor &nbhd_idx); // b x n_ x m 13 | 14 | std::vector clusten_wf_cuda_backward( 15 | const torch::Tensor &d_feat_new, 16 | const torch::Tensor &weights, 17 | const torch::Tensor &feat, 18 | const torch::Tensor &nbhd_idx); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor clusten_wf_forward( 26 | const torch::Tensor &weights, 27 | const torch::Tensor &feat, 28 | const torch::Tensor &nbhd_idx) { 29 | CHECK_INPUT(weights); 30 | CHECK_INPUT(feat); 31 | CHECK_INPUT(nbhd_idx); 32 | return clusten_wf_cuda_forward(weights, feat, nbhd_idx); 33 | } 34 | 35 | std::vector clusten_wf_backward( 36 | const torch::Tensor &d_feat_new, 37 | const torch::Tensor &weights, 38 | const torch::Tensor &feat, 39 | const torch::Tensor &nbhd_idx) { 40 | CHECK_INPUT(d_feat_new); 41 | CHECK_INPUT(weights); 42 | CHECK_INPUT(feat); 43 | CHECK_INPUT(nbhd_idx); 44 | return clusten_wf_cuda_backward(d_feat_new, weights, feat, nbhd_idx); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &clusten_wf_forward, "CLUSTENWF forward (CUDA)"); 50 | m.def("backward", &clusten_wf_backward, "CLUSTENWF backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/msdetrpc_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor msdetrpc_cuda_forward( 10 | const torch::Tensor &nn_idx, // b x n x m x k 11 | const torch::Tensor &nn_weight, // b x n x m x k 12 | const torch::Tensor &attn, // b x n x m 13 | const torch::Tensor &val); // b x n_ x c 14 | 15 | std::vector msdetrpc_cuda_backward( 16 | const torch::Tensor &d_feat, 17 | const torch::Tensor &nn_idx, 18 | const torch::Tensor &nn_weight, 19 | const torch::Tensor &attn, 20 | const torch::Tensor &val); 21 | 22 | // C++ interface 23 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 24 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 25 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 26 | 27 | torch::Tensor msdetrpc_forward( 28 | const torch::Tensor &nn_idx, 29 | const torch::Tensor &nn_weight, 30 | const torch::Tensor &attn, 31 | const torch::Tensor &val) { 32 | CHECK_INPUT(nn_idx); 33 | CHECK_INPUT(nn_weight); 34 | CHECK_INPUT(attn); 35 | CHECK_INPUT(val); 36 | return msdetrpc_cuda_forward(nn_idx, nn_weight, attn, val); 37 | } 38 | 39 | std::vector msdetrpc_backward( 40 | const torch::Tensor &d_feat, 41 | const torch::Tensor &nn_idx, 42 | const torch::Tensor &nn_weight, 43 | const torch::Tensor &attn, 44 | const torch::Tensor &val) { 45 | CHECK_INPUT(d_feat); 46 | CHECK_INPUT(nn_idx); 47 | CHECK_INPUT(nn_weight); 48 | CHECK_INPUT(attn); 49 | CHECK_INPUT(val); 50 | return msdetrpc_cuda_backward(d_feat, nn_idx, nn_weight, attn, val); 51 | } 52 | 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("forward", &msdetrpc_forward, "MSDETRPC forward (CUDA)"); 56 | m.def("backward", &msdetrpc_backward, "MSDETRPC backward (CUDA)"); 57 | } 58 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from setuptools import setup 7 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 8 | 9 | setup( 10 | name='clustencuda', 11 | version='0.1', 12 | author='Ziwen Chen', 13 | author_email='chenziw@oregonstate.edu', 14 | description='Cluster Attention CUDA Kernel', 15 | ext_modules=[ 16 | CUDAExtension('clustenqk_cuda', [ 17 | 'clustenqk_cuda.cpp', 18 | 'clustenqk_cuda_kernel.cu', 19 | ]), 20 | CUDAExtension('clustenav_cuda', [ 21 | 'clustenav_cuda.cpp', 22 | 'clustenav_cuda_kernel.cu', 23 | ]), 24 | CUDAExtension('clustenwf_cuda', [ 25 | 'clustenwf_cuda.cpp', 26 | 'clustenwf_cuda_kernel.cu', 27 | ]), 28 | CUDAExtension('weighted_gather_cuda', [ 29 | 'weighted_gather_cuda.cpp', 30 | 'weighted_gather_cuda_kernel.cu', 31 | ]), 32 | CUDAExtension('msdetrpc_cuda', [ 33 | 'msdetrpc_cuda.cpp', 34 | 'msdetrpc_cuda_kernel.cu', 35 | ]), 36 | ], 37 | cmdclass={ 38 | 'build_ext': BuildExtension 39 | }) 40 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/src/weighted_gather_cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * For licensing see accompanying LICENSE file. 3 | * Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | torch::Tensor weighted_gather_cuda_forward( 10 | const torch::Tensor &nbhd_idx, // b x n x m 11 | const torch::Tensor &weights, // b x n x m 12 | const torch::Tensor &feat); // b x n_ x c 13 | 14 | std::vector weighted_gather_cuda_backward( 15 | const torch::Tensor &d_feat_new, 16 | const torch::Tensor &nbhd_idx, 17 | const torch::Tensor &weights, 18 | const torch::Tensor &feat); 19 | 20 | // C++ interface 21 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") 22 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 23 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 24 | 25 | torch::Tensor weighted_gather_forward( 26 | const torch::Tensor &nbhd_idx, 27 | const torch::Tensor &weights, 28 | const torch::Tensor &feat) { 29 | CHECK_INPUT(nbhd_idx); 30 | CHECK_INPUT(weights); 31 | CHECK_INPUT(feat); 32 | return weighted_gather_cuda_forward(nbhd_idx, weights, feat); 33 | } 34 | 35 | std::vector weighted_gather_backward( 36 | const torch::Tensor &d_feat_new, 37 | const torch::Tensor &nbhd_idx, 38 | const torch::Tensor &weights, 39 | const torch::Tensor &feat) { 40 | CHECK_INPUT(d_feat_new); 41 | CHECK_INPUT(nbhd_idx); 42 | CHECK_INPUT(weights); 43 | CHECK_INPUT(feat); 44 | return weighted_gather_cuda_backward(d_feat_new, nbhd_idx, weights, feat); 45 | } 46 | 47 | 48 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 49 | m.def("forward", &weighted_gather_forward, "WEIGHTEDGATHER forward (CUDA)"); 50 | m.def("backward", &weighted_gather_backward, "WEIGHTEDGATHER backward (CUDA)"); 51 | } 52 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/test_msdetrpc_kernel.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import torch 7 | from clusten import MSDETRPCFunction 8 | 9 | """ 10 | Test the correctness of MSDETR (point cloud) custom kernel 11 | """ 12 | 13 | b = 100 14 | n = 50 15 | n_ = 100 16 | m = 8 17 | k = 4 18 | c = 32 19 | 20 | # dummy data 21 | nn_idx = torch.randint(n_, (b, n, m, k)).cuda() 22 | nn_weights = torch.rand(b, n, m, k).cuda() 23 | attn = torch.rand(b, n, m).cuda() 24 | val = torch.rand(b, n_, c).cuda() 25 | 26 | nn_weights.requires_grad_(True) 27 | nn_weights.retain_grad() 28 | attn.requires_grad_(True) 29 | attn.retain_grad() 30 | val.requires_grad_(True) 31 | val.retain_grad() 32 | 33 | # use the custom kernel 34 | feat = MSDETRPCFunction.apply(nn_idx, nn_weights, attn, val) 35 | feat.mean().backward() 36 | grad_weights = nn_weights.grad.clone().detach() 37 | grad_attn = attn.grad.clone().detach() 38 | grad_val = val.grad.clone().detach() 39 | nn_weights.grad.data.zero_() 40 | attn.grad.data.zero_() 41 | val.grad.data.zero_() 42 | 43 | # use the pytorch equivalent 44 | nn_val = val.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, m, k, c) 45 | feat2 = ((nn_val * nn_weights.unsqueeze(4)).sum(3) * attn.unsqueeze(3)).sum(2) # b x n x c 46 | feat2.mean().backward() 47 | grad_weights2 = nn_weights.grad.clone().detach() 48 | grad_attn2 = attn.grad.clone().detach() 49 | grad_val2 = val.grad.clone().detach() 50 | nn_weights.grad.data.zero_() 51 | attn.grad.data.zero_() 52 | val.grad.data.zero_() 53 | 54 | print('diff of forward: ', torch.linalg.norm(feat2 - feat)) 55 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights)) 56 | print('diff of grad attn: ', torch.linalg.norm(grad_attn2 - grad_attn)) 57 | print('diff of grad val: ', torch.linalg.norm(grad_val2 - grad_val)) 58 | -------------------------------------------------------------------------------- /mask2former/modeling/clusten/test_wg_kernel.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2023 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import torch 7 | from clusten import WEIGHTEDGATHERFunction 8 | 9 | """ 10 | Test the correctness of WeightedGather custom kernel 11 | """ 12 | 13 | b = 100 14 | n = 50 15 | n_ = 100 16 | k = 4 17 | c = 32 18 | 19 | # dummy data 20 | nn_idx = torch.randint(n_, (b, n, k)).cuda() 21 | nn_weights = torch.rand(b, n, k).cuda() 22 | feature = torch.rand(b, n_, c).cuda() 23 | nn_weights.requires_grad_(True) 24 | nn_weights.retain_grad() 25 | feature.requires_grad_(True) 26 | feature.retain_grad() 27 | 28 | # use the custom kernel 29 | up_features = WEIGHTEDGATHERFunction.apply(nn_idx, nn_weights, feature) 30 | up_features.mean().backward() 31 | grad_weights = nn_weights.grad.clone().detach() 32 | grad_feat = feature.grad.clone().detach() 33 | nn_weights.grad.data.zero_() 34 | feature.grad.data.zero_() 35 | 36 | # use the pytorch equivalent 37 | nn_features = feature.gather(index=nn_idx.view(b, -1).unsqueeze(2).expand(-1, -1, c), dim=1).reshape(b, n, k, c) 38 | up_features2 = nn_features.mul(nn_weights.unsqueeze(3).expand(-1, -1, -1, c)).sum(dim=2) # b x n x c 39 | up_features2.mean().backward() 40 | grad_weights2 = nn_weights.grad.clone().detach() 41 | grad_feat2 = feature.grad.clone().detach() 42 | nn_weights.grad.data.zero_() 43 | feature.grad.data.zero_() 44 | 45 | print('diff of forward: ', torch.linalg.norm(up_features2 - up_features)) 46 | print('diff of grad weights: ', torch.linalg.norm(grad_weights2 - grad_weights)) 47 | print('diff of grad feat: ', torch.linalg.norm(grad_feat2 - grad_feat)) 48 | -------------------------------------------------------------------------------- /mask2former/modeling/matcher.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py 3 | 4 | """ 5 | Modules to compute the matching cost and solve the corresponding LSAP. 6 | """ 7 | import torch 8 | import torch.nn.functional as F 9 | from scipy.optimize import linear_sum_assignment 10 | from torch import nn 11 | from torch.cuda.amp import autocast 12 | 13 | from detectron2.projects.point_rend.point_features import point_sample 14 | 15 | 16 | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): 17 | """ 18 | Compute the DICE loss, similar to generalized IOU for masks 19 | Args: 20 | inputs: A float tensor of arbitrary shape. 21 | The predictions for each example. 22 | targets: A float tensor with the same shape as inputs. Stores the binary 23 | classification label for each element in inputs 24 | (0 for the negative class and 1 for the positive class). 25 | """ 26 | inputs = inputs.sigmoid() 27 | inputs = inputs.flatten(1) 28 | numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) 29 | denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] 30 | loss = 1 - (numerator + 1) / (denominator + 1) 31 | return loss 32 | 33 | 34 | batch_dice_loss_jit = torch.jit.script( 35 | batch_dice_loss 36 | ) # type: torch.jit.ScriptModule 37 | 38 | 39 | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): 40 | """ 41 | Args: 42 | inputs: A float tensor of arbitrary shape. 43 | The predictions for each example. 44 | targets: A float tensor with the same shape as inputs. Stores the binary 45 | classification label for each element in inputs 46 | (0 for the negative class and 1 for the positive class). 
47 | Returns: 48 | Loss tensor 49 | """ 50 | hw = inputs.shape[1] 51 | 52 | pos = F.binary_cross_entropy_with_logits( 53 | inputs, torch.ones_like(inputs), reduction="none" 54 | ) 55 | neg = F.binary_cross_entropy_with_logits( 56 | inputs, torch.zeros_like(inputs), reduction="none" 57 | ) 58 | 59 | loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( 60 | "nc,mc->nm", neg, (1 - targets) 61 | ) 62 | 63 | return loss / hw 64 | 65 | 66 | batch_sigmoid_ce_loss_jit = torch.jit.script( 67 | batch_sigmoid_ce_loss 68 | ) # type: torch.jit.ScriptModule 69 | 70 | 71 | class HungarianMatcher(nn.Module): 72 | """This class computes an assignment between the targets and the predictions of the network 73 | 74 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 75 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 76 | while the others are un-matched (and thus treated as non-objects). 77 | """ 78 | 79 | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): 80 | """Creates the matcher 81 | 82 | Params: 83 | cost_class: This is the relative weight of the classification error in the matching cost 84 | cost_mask: This is the relative weight of the sigmoid cross-entropy loss of the binary mask in the matching cost 85 | cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost 86 | """ 87 | super().__init__() 88 | self.cost_class = cost_class 89 | self.cost_mask = cost_mask 90 | self.cost_dice = cost_dice 91 | 92 | assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs can't be 0" 93 | 94 | self.num_points = num_points 95 | 96 | @torch.no_grad() 97 | def memory_efficient_forward(self, outputs, targets): 98 | """More memory-friendly matching""" 99 | bs, num_queries = outputs["pred_logits"].shape[:2] 100 | 101 | indices = [] 102 | 103 | # Iterate through batch size 104 | for b in range(bs): 105 | 106 | out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] 107 | tgt_ids = targets[b]["labels"] 108 | 109 | # Compute the classification cost. Contrary to the loss, we don't use the NLL, 110 | # but approximate it by 1 - proba[target class]. 111 | # The 1 is a constant that doesn't change the matching, so it can be omitted. 112 | cost_class = -out_prob[:, tgt_ids] 113 | 114 | out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] 115 | # gt masks are already padded when preparing target 116 | tgt_mask = targets[b]["masks"].to(out_mask) 117 | 118 | out_mask = out_mask[:, None] 119 | tgt_mask = tgt_mask[:, None] 120 | # all masks share the same set of points for efficient matching!
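# (Rather than comparing full-resolution masks, `num_points` random coordinates in
# [0, 1] x [0, 1] are drawn once per image and both the predicted and ground-truth masks
# are evaluated only at those points, so each entry of the cost matrix is computed from
# num_points samples instead of H x W pixels.)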
121 | point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) 122 | # get gt labels 123 | tgt_mask = point_sample( 124 | tgt_mask, 125 | point_coords.repeat(tgt_mask.shape[0], 1, 1), 126 | align_corners=False, 127 | ).squeeze(1) 128 | 129 | out_mask = point_sample( 130 | out_mask, 131 | point_coords.repeat(out_mask.shape[0], 1, 1), 132 | align_corners=False, 133 | ).squeeze(1) 134 | 135 | with autocast(enabled=False): 136 | out_mask = out_mask.float() 137 | tgt_mask = tgt_mask.float() 138 | # Compute the sigmoid cross-entropy loss between masks 139 | # cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) 140 | cost_mask = batch_sigmoid_ce_loss(out_mask, tgt_mask) 141 | 142 | # Compute the dice loss between masks 143 | # cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) 144 | cost_dice = batch_dice_loss(out_mask, tgt_mask) 145 | 146 | # Final cost matrix 147 | C = ( 148 | self.cost_mask * cost_mask 149 | + self.cost_class * cost_class 150 | + self.cost_dice * cost_dice 151 | ) 152 | C = C.reshape(num_queries, -1).cpu() 153 | 154 | indices.append(linear_sum_assignment(C)) 155 | 156 | return [ 157 | (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) 158 | for i, j in indices 159 | ] 160 | 161 | @torch.no_grad() 162 | def forward(self, outputs, targets): 163 | """Performs the matching 164 | 165 | Params: 166 | outputs: This is a dict that contains at least these entries: 167 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 168 | "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks 169 | 170 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 171 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 172 | objects in the target) containing the class labels 173 | "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks 174 | 175 | Returns: 176 | A list of size batch_size, containing tuples of (index_i, index_j) where: 177 | - index_i is the indices of the selected predictions (in order) 178 | - index_j is the indices of the corresponding selected targets (in order) 179 | For each batch element, it holds: 180 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 181 | """ 182 | return self.memory_efficient_forward(outputs, targets) 183 | 184 | def __repr__(self, _repr_indent=4): 185 | head = "Matcher " + self.__class__.__name__ 186 | body = [ 187 | "cost_class: {}".format(self.cost_class), 188 | "cost_mask: {}".format(self.cost_mask), 189 | "cost_dice: {}".format(self.cost_dice), 190 | ] 191 | lines = [head] + [" " * _repr_indent + line for line in body] 192 | return "\n".join(lines) 193 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/modeling/meta_arch/mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
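# This head couples the pixel decoder with the transformer decoder: the pixel decoder turns
# backbone features into mask features plus multi-scale features, and the transformer
# predictor consumes those to produce the class and mask predictions (see layers() below).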
2 | 3 | import logging 4 | from typing import Dict 5 | 6 | from torch import nn 7 | 8 | from detectron2.config import configurable 9 | from detectron2.layers import ShapeSpec 10 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 11 | 12 | from ..transformer_decoder.mask2former_transformer_decoder import build_transformer_decoder 13 | from ..pixel_decoder.msdeformattn_pc import build_pixel_decoder 14 | 15 | 16 | @SEM_SEG_HEADS_REGISTRY.register() 17 | class MaskFormerHead(nn.Module): 18 | 19 | _version = 2 20 | 21 | def _load_from_state_dict( 22 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 23 | ): 24 | version = local_metadata.get("version", None) 25 | if version is None or version < 2: 26 | # Do not warn if training from scratch 27 | scratch = True 28 | logger = logging.getLogger(__name__) 29 | for k in list(state_dict.keys()): 30 | newk = k 31 | if newk != k: 32 | state_dict[newk] = state_dict[k] 33 | del state_dict[k] 34 | scratch = False 35 | 36 | if not scratch: 37 | logger.warning( 38 | f"Weight format of {self.__class__.__name__} has changed! " 39 | "Please upgrade your models. Applying automatic conversion now ..." 40 | ) 41 | 42 | 43 | @configurable 44 | def __init__( 45 | self, 46 | input_shape: Dict[str, ShapeSpec], 47 | *, 48 | num_classes: int, 49 | pixel_decoder: nn.Module, 50 | loss_weight: float = 1.0, 51 | ignore_value: int = -1, 52 | # extra parameters 53 | transformer_predictor: nn.Module, 54 | transformer_in_feature: str, 55 | ): 56 | """ 57 | NOTE: this interface is experimental. 58 | Args: 59 | input_shape: shapes (channels and stride) of the input features 60 | num_classes: number of classes to predict 61 | pixel_decoder: the pixel decoder module 62 | loss_weight: loss weight 63 | ignore_value: category id to be ignored during training.
64 | transformer_predictor: the transformer decoder that makes prediction 65 | transformer_in_feature: input feature name to the transformer_predictor 66 | """ 67 | super().__init__() 68 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 69 | self.in_features = [k for k, v in input_shape] 70 | feature_strides = [v.stride for k, v in input_shape] 71 | feature_channels = [v.channels for k, v in input_shape] 72 | 73 | self.ignore_value = ignore_value 74 | self.common_stride = 4 75 | self.loss_weight = loss_weight 76 | 77 | self.pixel_decoder = pixel_decoder 78 | self.predictor = transformer_predictor 79 | self.transformer_in_feature = transformer_in_feature 80 | 81 | self.num_classes = num_classes 82 | 83 | @classmethod 84 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 85 | # figure out in_channels to transformer predictor 86 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": 87 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 88 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": 89 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 90 | elif cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": # for maskformer2 91 | transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 92 | else: 93 | transformer_predictor_in_channels = input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels 94 | 95 | return { 96 | "input_shape": { 97 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 98 | }, 99 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 100 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 101 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 102 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 103 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 104 | "transformer_predictor": build_transformer_decoder( 105 | cfg, 106 | transformer_predictor_in_channels, 107 | mask_classification=True, 108 | ), 109 | } 110 | 111 | def forward(self, features, mask=None): 112 | return self.layers(features, mask) 113 | 114 | def layers(self, features, mask=None): 115 | mask_features, mf_pos, transformer_encoder_features, multi_scale_features, multi_scale_poss = self.pixel_decoder.forward_features(features) 116 | if self.transformer_in_feature == "multi_scale_pixel_decoder": 117 | predictions = self.predictor(multi_scale_features, multi_scale_poss, mask_features, mf_pos, mask) 118 | else: 119 | if self.transformer_in_feature == "transformer_encoder": 120 | assert ( 121 | transformer_encoder_features is not None 122 | ), "Please use the TransformerEncoderPixelDecoder." 123 | predictions = self.predictor(transformer_encoder_features, mask_features, mask) 124 | elif self.transformer_in_feature == "pixel_embedding": 125 | predictions = self.predictor(mask_features, mask_features, mask) 126 | else: 127 | predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) 128 | return predictions 129 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | 3 | from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder 4 | -------------------------------------------------------------------------------- /mask2former/modeling/transformer_decoder/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | """ 6 | Various positional encodings for the transformer. 7 | """ 8 | import math 9 | 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class PositionEmbeddingSine(nn.Module): 15 | """ 16 | This is a more standard version of the position embedding, very similar to the one 17 | used by the Attention is all you need paper, generalized to work on images. 18 | """ 19 | 20 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 21 | super().__init__() 22 | self.num_pos_feats = num_pos_feats 23 | self.temperature = temperature 24 | self.normalize = normalize 25 | if scale is not None and normalize is False: 26 | raise ValueError("normalize should be True if scale is passed") 27 | if scale is None: 28 | scale = 2 * math.pi 29 | self.scale = scale 30 | 31 | def forward(self, pos): 32 | ''' 33 | pos - b x n x d 34 | ''' 35 | b, n, d = pos.shape 36 | y_embed = pos[:, :, 1] # b x n 37 | x_embed = pos[:, :, 0] 38 | if self.normalize: 39 | eps = 1e-6 40 | y_embed = y_embed / (y_embed.max() + eps) * self.scale 41 | x_embed = x_embed / (x_embed.max() + eps) * self.scale 42 | 43 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=pos.device) # npf 44 | dim_t = self.temperature ** (2 * (dim_t.div(2, rounding_mode='floor')) / self.num_pos_feats) # npf 45 | 46 | pos_x = x_embed[:, :, None] / dim_t # b x n x npf 47 | pos_y = y_embed[:, :, None] / dim_t 48 | pos_x = torch.cat( 49 | (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=2 50 | ) 51 | pos_y = torch.cat( 52 | (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=2 53 | ) 54 | pos = torch.cat((pos_x, pos_y), dim=2) # b x n x d' 55 | return pos 56 | 57 | def __repr__(self, _repr_indent=4): 58 | head = "Positional encoding " + self.__class__.__name__ 59 | body = [ 60 | "num_pos_feats: {}".format(self.num_pos_feats), 61 | "temperature: {}".format(self.temperature), 62 | "normalize: {}".format(self.normalize), 63 | "scale: {}".format(self.scale), 64 | ] 65 | # _repr_indent = 4 66 | lines = [head] + [" " * _repr_indent + line for line in body] 67 | return "\n".join(lines) 68 | -------------------------------------------------------------------------------- /mask2former/test_time_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import copy 4 | 5 | import numpy as np 6 | import torch 7 | from fvcore.transforms import HFlipTransform 8 | from torch import nn 9 | from torch.nn.parallel import DistributedDataParallel 10 | 11 | from detectron2.data.detection_utils import read_image 12 | from detectron2.modeling import DatasetMapperTTA 13 | 14 | 15 | __all__ = [ 16 | "SemanticSegmentorWithTTA", 17 | ] 18 | 19 | 20 | class SemanticSegmentorWithTTA(nn.Module): 21 | """ 22 | A SemanticSegmentor with test-time augmentation enabled. 23 | Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`. 
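Predictions are averaged over the augmented inputs produced by ``tta_mapper``; outputs of
horizontally flipped inputs are flipped back before averaging (see :meth:`_inference_one_image`).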
24 | """ 25 | 26 | def __init__(self, cfg, model, tta_mapper=None, batch_size=1): 27 | """ 28 | Args: 29 | cfg (CfgNode): 30 | model (SemanticSegmentor): a SemanticSegmentor to apply TTA on. 31 | tta_mapper (callable): takes a dataset dict and returns a list of 32 | augmented versions of the dataset dict. Defaults to 33 | `DatasetMapperTTA(cfg)`. 34 | batch_size (int): batch the augmented images into this batch size for inference. 35 | """ 36 | super().__init__() 37 | if isinstance(model, DistributedDataParallel): 38 | model = model.module 39 | self.cfg = cfg.clone() 40 | 41 | self.model = model 42 | 43 | if tta_mapper is None: 44 | tta_mapper = DatasetMapperTTA(cfg) 45 | self.tta_mapper = tta_mapper 46 | self.batch_size = batch_size 47 | 48 | def __call__(self, batched_inputs): 49 | """ 50 | Same input/output format as :meth:`SemanticSegmentor.forward` 51 | """ 52 | 53 | def _maybe_read_image(dataset_dict): 54 | ret = copy.copy(dataset_dict) 55 | if "image" not in ret: 56 | image = read_image(ret.pop("file_name"), self.model.input_format) 57 | image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW 58 | ret["image"] = image 59 | if "height" not in ret and "width" not in ret: 60 | ret["height"] = image.shape[1] 61 | ret["width"] = image.shape[2] 62 | return ret 63 | 64 | processed_results = [] 65 | for x in batched_inputs: 66 | result = self._inference_one_image(_maybe_read_image(x)) 67 | processed_results.append(result) 68 | return processed_results 69 | 70 | def _inference_one_image(self, input): 71 | """ 72 | Args: 73 | input (dict): one dataset dict with "image" field being a CHW tensor 74 | Returns: 75 | dict: one output dict 76 | """ 77 | orig_shape = (input["height"], input["width"]) 78 | augmented_inputs, tfms = self._get_augmented_inputs(input) 79 | 80 | final_predictions = None 81 | count_predictions = 0 82 | for input, tfm in zip(augmented_inputs, tfms): 83 | count_predictions += 1 84 | with torch.no_grad(): 85 | if final_predictions is None: 86 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 87 | final_predictions = self.model([input])[0].pop("sem_seg").flip(dims=[2]) 88 | else: 89 | final_predictions = self.model([input])[0].pop("sem_seg") 90 | else: 91 | if any(isinstance(t, HFlipTransform) for t in tfm.transforms): 92 | final_predictions += self.model([input])[0].pop("sem_seg").flip(dims=[2]) 93 | else: 94 | final_predictions += self.model([input])[0].pop("sem_seg") 95 | 96 | final_predictions = final_predictions / count_predictions 97 | return {"sem_seg": final_predictions} 98 | 99 | def _get_augmented_inputs(self, input): 100 | augmented_inputs = self.tta_mapper(input) 101 | tfms = [x.pop("transforms") for x in augmented_inputs] 102 | return augmented_inputs, tfms 103 | -------------------------------------------------------------------------------- /mask2former/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /mask2former/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 
7 | """ 8 | from typing import List, Optional 9 | 10 | import torch 11 | import torch.distributed as dist 12 | import torchvision 13 | from torch import Tensor 14 | 15 | 16 | def _max_by_axis(the_list): 17 | # type: (List[List[int]]) -> List[int] 18 | maxes = the_list[0] 19 | for sublist in the_list[1:]: 20 | for index, item in enumerate(sublist): 21 | maxes[index] = max(maxes[index], item) 22 | return maxes 23 | 24 | 25 | class NestedTensor(object): 26 | def __init__(self, tensors, mask: Optional[Tensor]): 27 | self.tensors = tensors 28 | self.mask = mask 29 | 30 | def to(self, device): 31 | # type: (Device) -> NestedTensor # noqa 32 | cast_tensor = self.tensors.to(device) 33 | mask = self.mask 34 | if mask is not None: 35 | assert mask is not None 36 | cast_mask = mask.to(device) 37 | else: 38 | cast_mask = None 39 | return NestedTensor(cast_tensor, cast_mask) 40 | 41 | def decompose(self): 42 | return self.tensors, self.mask 43 | 44 | def __repr__(self): 45 | return str(self.tensors) 46 | 47 | 48 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 49 | # TODO make this more general 50 | if tensor_list[0].ndim == 3: 51 | if torchvision._is_tracing(): 52 | # nested_tensor_from_tensor_list() does not export well to ONNX 53 | # call _onnx_nested_tensor_from_tensor_list() instead 54 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 55 | 56 | # TODO make it support different-sized images 57 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 58 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 59 | batch_shape = [len(tensor_list)] + max_size 60 | b, c, h, w = batch_shape 61 | dtype = tensor_list[0].dtype 62 | device = tensor_list[0].device 63 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 64 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 65 | for img, pad_img, m in zip(tensor_list, tensor, mask): 66 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 67 | m[: img.shape[1], : img.shape[2]] = False 68 | else: 69 | raise ValueError("not supported") 70 | return NestedTensor(tensor, mask) 71 | 72 | 73 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 74 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
75 | @torch.jit.unused 76 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 77 | max_size = [] 78 | for i in range(tensor_list[0].dim()): 79 | max_size_i = torch.max( 80 | torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) 81 | ).to(torch.int64) 82 | max_size.append(max_size_i) 83 | max_size = tuple(max_size) 84 | 85 | # work around for 86 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 87 | # m[: img.shape[1], :img.shape[2]] = False 88 | # which is not yet supported in onnx 89 | padded_imgs = [] 90 | padded_masks = [] 91 | for img in tensor_list: 92 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 93 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 94 | padded_imgs.append(padded_img) 95 | 96 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 97 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 98 | padded_masks.append(padded_mask.to(torch.bool)) 99 | 100 | tensor = torch.stack(padded_imgs) 101 | mask = torch.stack(padded_masks) 102 | 103 | return NestedTensor(tensor, mask=mask) 104 | 105 | 106 | def is_dist_avail_and_initialized(): 107 | if not dist.is_available(): 108 | return False 109 | if not dist.is_initialized(): 110 | return False 111 | return True 112 | -------------------------------------------------------------------------------- /run_aff_segmentation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # number of parallel gpus 4 | GPUS=2 5 | 6 | # path to config file 7 | CONFIG=configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml 8 | 9 | # checkpoint path for resume 10 | RESUME=checkpoints/city_pan/aff_small.pth 11 | 12 | # output folder 13 | OUTPUT=outputs/ 14 | 15 | python train_net.py --num-gpus $GPUS \ 16 | --config-file $CONFIG \ 17 | --dist-url tcp://127.0.0.1:12345 \ 18 | --resume \ 19 | --eval-only \ 20 | MODEL.WEIGHTS $RESUME \ 21 | OUTPUT_DIR $OUTPUT 22 | 23 | # Remove '--resume', '--eval-only' and 'MODEL.WEIGHTS' to start training from fresh. 24 | # Note that if '--resume' is on, the 'MODEL.WEIGHTS' option will be overwritten by the last_checkpoint file in the output folder (auto-resume), if the file exists. 25 | # The KEY VALUE pairs must be at the end, after all the flags. 
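# For example, a training-from-scratch run could look like the following (the output path is
# only a placeholder):
#   python train_net.py --num-gpus $GPUS --config-file $CONFIG --dist-url tcp://127.0.0.1:12345 OUTPUT_DIR outputs/train_from_scratch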
26 | -------------------------------------------------------------------------------- /run_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # path to config file 4 | CONFIG="../configs/cityscapes/panoptic-segmentation/aff/maskformer2_aff_small_bs32_45k.yaml" 5 | 6 | # path to pre-trained checkpoint 7 | CKPT="../checkpoints/city_pan/aff_small.pth" 8 | 9 | # path to images for prediction 10 | INPUTS="../imgs/*.jpg" 11 | 12 | # path to blurred version of input images (optional) 13 | BLUR="../imgs_blur/" 14 | 15 | # output folder to store results 16 | OUTPUT="demo_res" 17 | 18 | # create output folder 19 | mkdir $OUTPUT 20 | 21 | # run visualization code 22 | cd demo/ 23 | python demo.py --config-file $CONFIG \ 24 | --input $INPUTS \ 25 | --output ../$OUTPUT \ 26 | --blur $BLUR \ 27 | --opts MODEL.WEIGHTS $CKPT \ 28 | 29 | # The --opts flag should always be the last one 30 | # Remove --blur flag to visualize predictions on original images 31 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a few tools. 2 | 3 | * `convert-pretrained-model-to-d2.py` 4 | 5 | Tool to convert ImageNet pre-trained weights for D2. 6 | 7 | * `analyze_model.py` 8 | 9 | Tool to analyze model parameters and flops. 10 | 11 | Usage for semantic segmentation (ADE20K only, use with caution!): 12 | 13 | ``` 14 | python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE 15 | ``` 16 | 17 | Note that, for semantic segmentation (ADE20K only), we use a dummy image with fixed size that equals to `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`. 18 | Please do not use `--use-fixed-input-size` for calculating FLOPs on other datasets like COCO! 19 | 20 | Usage for panoptic and instance segmentation: 21 | 22 | ``` 23 | python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE 24 | ``` 25 | 26 | Note that, for panoptic and instance segmentation, we compute the average flops over 100 real validation images. 27 | -------------------------------------------------------------------------------- /tools/analyze_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py 4 | 5 | import logging 6 | import numpy as np 7 | from collections import Counter 8 | import tqdm 9 | from fvcore.nn import flop_count_table # can also try flop_count_str 10 | 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 13 | from detectron2.data import build_detection_test_loader 14 | from detectron2.engine import default_argument_parser 15 | from detectron2.modeling import build_model 16 | from detectron2.projects.deeplab import add_deeplab_config 17 | from detectron2.utils.analysis import ( 18 | FlopCountAnalysis, 19 | activation_count_operators, 20 | parameter_count_table, 21 | ) 22 | from detectron2.utils.logger import setup_logger 23 | 24 | # fmt: off 25 | import os 26 | import sys 27 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 28 | # fmt: on 29 | 30 | from mask2former import add_maskformer2_config 31 | 32 | logger = logging.getLogger("detectron2") 33 | 34 | """ 35 | Analyzes FLOP count, parameter count, model structure and operator activation count for models 36 | For usage example, please refer to tools/README.md 37 | """ 38 | 39 | 40 | def setup(args): 41 | if args.config_file.endswith(".yaml"): 42 | cfg = get_cfg() 43 | add_deeplab_config(cfg) 44 | add_maskformer2_config(cfg) 45 | cfg.merge_from_file(args.config_file) 46 | cfg.DATALOADER.NUM_WORKERS = 0 47 | cfg.merge_from_list(args.opts) 48 | cfg.freeze() 49 | else: 50 | cfg = LazyConfig.load(args.config_file) 51 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 52 | setup_logger(name="fvcore") 53 | setup_logger() 54 | return cfg 55 | 56 | 57 | def do_flop(cfg): 58 | if isinstance(cfg, CfgNode): 59 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 60 | model = build_model(cfg) 61 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 62 | else: 63 | data_loader = instantiate(cfg.dataloader.test) 64 | model = instantiate(cfg.model) 65 | model.to(cfg.train.device) 66 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 67 | model.eval() 68 | 69 | counts = Counter() 70 | total_flops = [] 71 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 72 | if args.use_fixed_input_size and isinstance(cfg, CfgNode): 73 | import torch 74 | crop_size = cfg.INPUT.CROP.SIZE[0] 75 | data[0]["image"] = torch.zeros((3, crop_size, crop_size)) 76 | flops = FlopCountAnalysis(model, data) 77 | if idx > 0: 78 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 79 | counts += flops.by_operator() 80 | total_flops.append(flops.total()) 81 | 82 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 83 | logger.info( 84 | "Average GFlops for each type of operators:\n" 85 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 86 | ) 87 | logger.info( 88 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 89 | ) 90 | 91 | 92 | def do_activation(cfg): 93 | if isinstance(cfg, CfgNode): 94 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 95 | model = build_model(cfg) 96 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 97 | else: 98 | data_loader = instantiate(cfg.dataloader.test) 99 | model = instantiate(cfg.model) 100 | model.to(cfg.train.device) 101 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 102 | model.eval() 103 | 104 | counts = Counter() 
105 | total_activations = [] 106 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 107 | count = activation_count_operators(model, data) 108 | counts += count 109 | total_activations.append(sum(count.values())) 110 | logger.info( 111 | "(Million) Activations for Each Type of Operators:\n" 112 | + str([(k, v / idx) for k, v in counts.items()]) 113 | ) 114 | logger.info( 115 | "Total (Million) Activations: {}±{}".format( 116 | np.mean(total_activations), np.std(total_activations) 117 | ) 118 | ) 119 | 120 | 121 | def do_parameter(cfg): 122 | if isinstance(cfg, CfgNode): 123 | model = build_model(cfg) 124 | else: 125 | model = instantiate(cfg.model) 126 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 127 | 128 | 129 | def do_structure(cfg): 130 | if isinstance(cfg, CfgNode): 131 | model = build_model(cfg) 132 | else: 133 | model = instantiate(cfg.model) 134 | logger.info("Model Structure:\n" + str(model)) 135 | 136 | 137 | if __name__ == "__main__": 138 | parser = default_argument_parser( 139 | epilog=""" 140 | Examples: 141 | To show parameters of a model: 142 | $ ./analyze_model.py --tasks parameter \\ 143 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 144 | Flops and activations are data-dependent, therefore inputs and model weights 145 | are needed to count them: 146 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 147 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 148 | MODEL.WEIGHTS /path/to/model.pkl 149 | """ 150 | ) 151 | parser.add_argument( 152 | "--tasks", 153 | choices=["flop", "activation", "parameter", "structure"], 154 | required=True, 155 | nargs="+", 156 | ) 157 | parser.add_argument( 158 | "-n", 159 | "--num-inputs", 160 | default=100, 161 | type=int, 162 | help="number of inputs used to compute statistics for flops/activations, " 163 | "both are data dependent.", 164 | ) 165 | parser.add_argument( 166 | "--use-fixed-input-size", 167 | action="store_true", 168 | help="use fixed input size when calculating flops", 169 | ) 170 | args = parser.parse_args() 171 | assert not args.eval_only 172 | assert args.num_gpus == 1 173 | 174 | cfg = setup(args) 175 | 176 | for task in args.tasks: 177 | { 178 | "flop": do_flop, 179 | "activation": do_activation, 180 | "parameter": do_parameter, 181 | "structure": do_structure, 182 | }[task](cfg) 183 | -------------------------------------------------------------------------------- /tools/convert-pretrained-model-to-d2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Adapted for AutoFocusFormer by Ziwen 2023 4 | 5 | import pickle as pkl 6 | import sys 7 | 8 | import torch 9 | 10 | """ 11 | Usage: 12 | # run the conversion 13 | python ./convert-pretrained-model-to-d2.py aff.pth aff.pkl 14 | # Then, use aff.pkl in config: 15 | MODEL: 16 | WEIGHTS: "/path/to/aff.pkl" 17 | INPUT: 18 | FORMAT: "RGB" 19 | """ 20 | 21 | if __name__ == "__main__": 22 | input = sys.argv[1] 23 | 24 | obj = torch.load(input, map_location="cpu")["model"] 25 | 26 | res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} 27 | 28 | with open(sys.argv[2], "wb") as f: 29 | pkl.dump(res, f) 30 | --------------------------------------------------------------------------------