├── .gitignore ├── LICENSE ├── README.md ├── pipeline.png └── playground └── detection ├── coco ├── anchor.res50.fpn.coco.800size.3x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── center.res50.fpn.coco.800size.3x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── fcos.res50.fpn.coco.800size.3x_ms │ ├── README.md │ ├── config.py │ └── net.py ├── loss.res50.fpn.coco.800size.3x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms.3dmf.aux │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms.3dmf │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms.argmax │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.3x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py ├── poto.res50.fpn.coco.800size.6x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py └── poto.res50.fpn.coco.800size.9x_ms │ ├── README.md │ ├── config.py │ ├── fcos.py │ └── net.py └── crowdhuman ├── atss.res50.fpn.crowdhuman.800size.30k ├── README.md ├── atss.py ├── config.py └── net.py ├── fcos.res50.fpn.crowdhuman.800size.30k ├── README.md ├── config.py ├── fcos.py └── net.py ├── poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux ├── README.md ├── config.py ├── fcos.py └── net.py ├── poto.res50.fpn.crowdhuman.800size.30k.3dmf ├── README.md ├── config.py ├── fcos.py └── net.py └── poto.res50.fpn.crowdhuman.800size.30k ├── README.md ├── config.py ├── fcos.py └── net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | log 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # End-to-End Object Detection with Fully Convolutional Network 2 | 3 | ![GitHub](https://img.shields.io/github/license/Megvii-BaseDetection/DeFCN) 4 | 5 | This project provides an implementation for "[End-to-End Object Detection with Fully Convolutional Network](https://arxiv.org/abs/2012.03544)" on PyTorch. 
6 | 7 | Experiments in the paper were conducted on the internal framework, thus we reimplement them on [cvpods](https://github.com/Megvii-BaseDetection/cvpods) and report details as below. 8 | 9 | ![](./pipeline.png) 10 | 11 | ## Requirements 12 | * [cvpods](https://github.com/Megvii-BaseDetection/cvpods) 13 | * scipy >= 1.5.4 14 | 15 | ## Get Started 16 | 17 | * install cvpods locally (requires cuda to compile) 18 | ```shell 19 | 20 | python3 -m pip install 'git+https://github.com/Megvii-BaseDetection/cvpods.git' 21 | # (add --user if you don't have permission) 22 | 23 | # Or, to install it from a local clone: 24 | git clone https://github.com/Megvii-BaseDetection/cvpods.git 25 | python3 -m pip install -e cvpods 26 | 27 | # Or, 28 | pip install -r requirements.txt 29 | python3 setup.py build develop 30 | ``` 31 | 32 | * prepare datasets 33 | ```shell 34 | cd /path/to/cvpods 35 | cd datasets 36 | ln -s /path/to/your/coco/dataset coco 37 | ``` 38 | 39 | * Train & Test 40 | ```shell 41 | git clone https://github.com/Megvii-BaseDetection/DeFCN.git 42 | cd DeFCN/playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms # for example 43 | 44 | # Train 45 | pods_train --num-gpus 8 46 | 47 | # Test 48 | pods_test --num-gpus 8 \ 49 | MODEL.WEIGHTS /path/to/your/save_dir/ckpt.pth # optional 50 | OUTPUT_DIR /path/to/your/save_dir # optional 51 | 52 | # Multi node training 53 | ## sudo apt install net-tools ifconfig 54 | pods_train --num-gpus 8 --num-machines N --machine-rank 0/1/.../N-1 --dist-url "tcp://MASTER_IP:port" 55 | 56 | ``` 57 | 58 | ## Results on COCO2017 val set 59 | 60 | | model | assignment | with NMS | lr sched. | mAP | mAR | download | 61 | |:------|:----------:|:--------:|:---------:|:---:|:---:|:--------:| 62 | | [FCOS](./playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms) | one-to-many | Yes | 3x + ms | 41.4 | 59.1 | [weight](https://drive.google.com/file/d/1j9FmyQQxB2g3J4M7F5DubBtW_7qXHiMv/view?usp=sharing) \| [log](https://drive.google.com/file/d/18RK2jZd7g198hAeAz80BsD_6cF8aY1mb/view?usp=sharing) | 63 | | [FCOS baseline](./playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness) | one-to-many | Yes | 3x + ms | 40.9 | 58.4 | [weight](https://drive.google.com/file/d/1diZQFuJQR6XzPXJsyh1zrRuFYjbqKZ9l/view?usp=sharing) \| [log](https://drive.google.com/file/d/1P1ouRHmSMB4-WZ_yu46lU3kVXlDQAkdE/view?usp=sharing) | 64 | | [Anchor](./playground/detection/coco/anchor.res50.fpn.coco.800size.3x_ms) | one-to-one | No | 3x + ms | 37.1 | 60.5 | [weight](https://drive.google.com/file/d/1ZVAZPoOlwtNVlxkaKEFWPrkH57nRpuKr/view?usp=sharing) \| [log](https://drive.google.com/file/d/1CVTcCJvLfPPCDN2rIhk8gX8vp98oQidM/view?usp=sharing) | 65 | | [Center](./playground/detection/coco/center.res50.fpn.coco.800size.3x_ms) | one-to-one | No | 3x + ms | 35.2 | 61.0 | [weight](https://drive.google.com/file/d/1TgNFHMs9uxjTrMMRTSXarwVWZOkX53av/view?usp=sharing) \| [log](https://drive.google.com/file/d/1zcnQTQaOXPLLoHy9lHwfFdESxhIkqD1R/view?usp=sharing) | 66 | | [Foreground Loss](./playground/detection/coco/loss.res50.fpn.coco.800size.3x_ms) | one-to-one | No | 3x + ms | 38.7 | 62.2 | [weight](https://drive.google.com/file/d/1rTsXbEC5Tj8kwXdjTuHYcfoap4TsnkXV/view?usp=sharing) \| [log](https://drive.google.com/file/d/1EAMPnK7s0TabKKzZhWjALsY1Hege4pFx/view?usp=sharing) | 67 | | [POTO](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms) | one-to-one | No | 3x + ms | 39.2 | 61.7 | [weight](https://drive.google.com/file/d/1mlk5dxc34PyXMajinlF_zWXxs84Z28MH/view?usp=sharing) \| 
[log](https://drive.google.com/file/d/1v4TBsbExylfgM7GfGh02vks8AnwSbPDI/view?usp=sharing) | 68 | | [POTO + 3DMF](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf) | one-to-one | No | 3x + ms | 40.6 | 61.6 | [weight](https://drive.google.com/file/d/1yUzhK_wtzr4_hqi_WT3YpDryGn_rrltU/view?usp=sharing) \| [log](https://drive.google.com/file/d/1ik5JnVLIzmuYlbCkq_MTEDrd2jWoNprV/view?usp=sharing) | 69 | | [POTO + 3DMF + Aux](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf.aux) | mixture\* | No | 3x + ms | 41.4 | 61.5 | [weight](https://drive.google.com/file/d/1bxpmTzVzCkV6BHzca_TVWo3pTOEZMAFS/view?usp=sharing) \| [log](https://drive.google.com/file/d/12LTwMJ3zuBYVa7K0OA0ZRTfC1kxianjW/view?usp=sharing) | 70 | 71 | \* We adopt a one-to-one assignment in POTO and a one-to-many assignment in the auxiliary loss, respectively. 72 | 73 | - `2x + ms` schedule is adopted in the paper, but we adopt `3x + ms` schedule here to achieve higher performance. 74 | - It's normal to observe ~0.3AP noise in POTO. 75 | 76 | ## Results on CrowdHuman val set 77 | 78 | | model | assignment | with NMS | lr sched. | AP50 | mMR | recall | download | 79 | |:------|:----------:|:--------:|:---------:|:----:|:---:|:------:|:--------:| 80 | | [FCOS](./playground/detection/crowdhuman/fcos.res50.fpn.crowdhuman.800size.30k) | one-to-many | Yes | 30k iters | 86.1 | 54.9 | 94.2 | [weight](https://drive.google.com/file/d/1qf34m13kniTK2fo2o8etjMfocezSyosQ/view?usp=sharing) \| [log](https://drive.google.com/file/d/1DgZbvawWGX7rBonS8WgcByIGn7nLNrmA/view?usp=sharing) | 81 | | [ATSS](./playground/detection/crowdhuman/atss.res50.fpn.crowdhuman.800size.30k) | one-to-many | Yes | 30k iters | 87.2 | 49.7 | 94.0 | [weight](https://drive.google.com/file/d/1J30DVItPgLVg9_ps-NdCXWYqaV0PvwAq/view?usp=sharing) \| [log](https://drive.google.com/file/d/1jdL2v_A_fhU6GjYBOzT80ps5CZEZBtx5/view?usp=sharing) | 82 | | [POTO](./playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k) | one-to-one | No | 30k iters | 88.5 | 52.2 | 96.3 | [weight](https://drive.google.com/file/d/1mbP0mmHpva30BcQIxY84XhEMsTGwi-ze/view?usp=sharing) \| [log](https://drive.google.com/file/d/1dmn2ENMkfNXaQUaruSR9Pu1QAAOAhlEC/view?usp=sharing) | 83 | | [POTO + 3DMF](./playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf) | one-to-one | No | 30k iters | 88.8 | 51.0 | 96.6 | [weight](https://drive.google.com/file/d/1d_Z6g54RTIVYHzaUrEogmL3gId2PTBSb/view?usp=sharing) \| [log](https://drive.google.com/file/d/12G-1nm34DjH2xJGRMsiV8OYIZzWooFkt/view?usp=sharing) | 84 | | [POTO + 3DMF + Aux](./playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux) | mixture\* | No | 30k iters | 89.1 | 48.9 | 96.5 | [weight](https://drive.google.com/file/d/1P5uWt4kjQnm-P_WC0MzqLC5TWbIH62UY/view?usp=sharing) \| [log](https://drive.google.com/file/d/1sTcb5B0vjwSC6QJnwJlLRBJQlVcM2WDl/view?usp=sharing) | 85 | 86 | \* We adopt a one-to-one assignment in POTO and a one-to-many assignment in the auxiliary loss, respectively. 87 | 88 | - It's normal to observe ~0.3AP noise in POTO, and ~1.0mMR noise in all methods. 89 | 90 | ## Ablations on COCO2017 val set 91 | 92 | | model | assignment | with NMS | lr sched. 
| mAP | mAR | note | 93 | |:------|:----------:|:--------:|:---------:|:---:|:---:|:----:| 94 | | [POTO](./playground/detection/coco/poto.res50.fpn.coco.800size.6x_ms) | one-to-one | No | 6x + ms | 40.0 | 61.9 | | 95 | | [POTO](./playground/detection/coco/poto.res50.fpn.coco.800size.9x_ms) | one-to-one | No | 9x + ms | 40.2 | 62.3 | | 96 | | [POTO](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.argmax) | one-to-one | No | 3x + ms | 39.2 | 61.1 | replace the Hungarian algorithm with `argmax` | 97 | | [POTO + 3DMF](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn) | one-to-one | No | 3x + ms | 40.9 | 62.0 | remove GN in 3DMF | 98 | | [POTO + 3DMF + Aux](./playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux) | mixture\* | No | 3x + ms | 41.5 | 61.5 | remove GN in 3DMF | 99 | 100 | \* We adopt a one-to-one assignment in POTO and a one-to-many assignment in the auxiliary loss, respectively. 101 | 102 | - For the `one-to-one` assignment, more training iterations lead to higher performance. 103 | - The `argmax` (also known as top-1) operation is an approximate solution to the bipartite matching problem in dense prediction methods; see the sketch at the end of this README. 104 | - It seems harmless to remove GN in 3DMF, which also leads to higher inference speed. 105 | 106 | ## Acknowledgement 107 | This repo is developed based on cvpods. Please check [cvpods](https://github.com/Megvii-BaseDetection/cvpods) for more details and features. 108 | 109 | ## License 110 | This repo is released under the Apache 2.0 license. Please see the LICENSE file for more information. 111 | 112 | ## Citing 113 | If you use this work in your research or wish to refer to the baseline results published here, please use the following BibTeX entry: 114 | ``` 115 | @article{wang2020end, 116 | title = {End-to-End Object Detection with Fully Convolutional Network}, 117 | author = {Wang, Jianfeng and Song, Lin and Li, Zeming and Sun, Hongbin and Sun, Jian and Zheng, Nanning}, 118 | journal = {arXiv preprint arXiv:2012.03544}, 119 | year = {2020} 120 | } 121 | ``` 122 | 123 | ## Contributing to the project 124 | Any pull requests or issues about the implementation are welcome. If you have any issues with the library (e.g. installation, environments), please refer to [cvpods](https://github.com/Megvii-BaseDetection/cvpods).
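
## Appendix: one-to-one assignment sketch

The ablation above compares the Hungarian algorithm with its `argmax` (top-1) approximation. The following standalone sketch is for illustration only and is not the code used in this repository: it assumes a precomputed `quality` matrix (a stand-in for the POTO matching quality between ground-truth boxes and dense prediction locations) and contrasts the two assignment strategies. The Hungarian branch uses `scipy.optimize.linear_sum_assignment`.

```python
import torch
from scipy.optimize import linear_sum_assignment


def one_to_one_assign(quality: torch.Tensor, use_hungarian: bool = True) -> torch.Tensor:
    """Toy one-to-one assignment over a (num_gt, num_candidates) quality matrix.

    Larger quality means a better match; assumes num_gt <= num_candidates.
    Returns a (num_gt,) LongTensor with the candidate index assigned to each
    ground-truth box.
    """
    if use_hungarian:
        # Global optimum of the bipartite matching problem: every candidate
        # location is used at most once.
        gt_idxs, cand_idxs = linear_sum_assignment(quality.cpu().numpy(), maximize=True)
        return torch.as_tensor(cand_idxs, dtype=torch.long)
    # Top-1 approximation: each ground-truth box independently takes its best
    # candidate, so two boxes may collide on the same location.
    return quality.argmax(dim=1)


if __name__ == "__main__":
    torch.manual_seed(0)
    quality = torch.rand(3, 10)  # 3 ground-truth boxes, 10 candidate locations
    print(one_to_one_assign(quality, use_hungarian=True))
    print(one_to_one_assign(quality, use_hungarian=False))
```

Note that in the paper the matching quality also incorporates the classification score and a spatial prior, so this sketch only illustrates the assignment step itself.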
125 | -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Megvii-BaseDetection/DeFCN/bd7e24d7408d63edc878d175d9cf798974095049/pipeline.png -------------------------------------------------------------------------------- /playground/detection/coco/anchor.res50.fpn.coco.800size.3x_ms/README.md: -------------------------------------------------------------------------------- 1 | # anchor.res50.fpn.coco.800size.3x_ms 2 | 3 | seed: 10266195 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.371 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.538 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.406 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.235 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.406 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.468 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.320 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.550 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.605 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.407 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.642 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.752 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 37.121 | 53.845 | 40.606 | 23.457 | 40.621 | 46.818 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 52.613 | bicycle | 27.819 | car | 41.466 | 30 | | motorcycle | 38.400 | airplane | 60.967 | bus | 62.903 | 31 | | train | 58.376 | truck | 30.933 | boat | 22.104 | 32 | | traffic light | 25.824 | fire hydrant | 64.486 | stop sign | 62.992 | 33 | | parking meter | 36.029 | bench | 19.416 | bird | 32.151 | 34 | | cat | 61.740 | dog | 53.929 | horse | 53.916 | 35 | | sheep | 47.808 | cow | 53.603 | elephant | 58.405 | 36 | | bear | 64.446 | zebra | 67.133 | giraffe | 64.894 | 37 | | backpack | 13.931 | umbrella | 35.645 | handbag | 13.484 | 38 | | tie | 29.557 | suitcase | 34.717 | frisbee | 60.803 | 39 | | skis | 18.115 | snowboard | 26.962 | sports ball | 47.191 | 40 | | kite | 41.637 | baseball bat | 21.392 | baseball glove | 32.027 | 41 | | skateboard | 45.930 | surfboard | 28.968 | tennis racket | 42.757 | 42 | | bottle | 34.439 | wine glass | 33.286 | cup | 38.131 | 43 | | fork | 26.456 | knife | 14.392 | spoon | 12.133 | 44 | | bowl | 36.752 | banana | 20.660 | apple | 18.448 | 45 | | sandwich | 28.772 | orange | 28.911 | broccoli | 21.992 | 46 | | carrot | 18.264 | hot dog | 28.677 | pizza | 47.652 | 47 | | donut | 40.860 | cake | 30.599 | chair | 24.919 | 48 | | couch | 40.305 | potted plant | 23.166 | bed | 34.359 | 49 | | dining table | 24.119 | toilet | 56.667 | tv | 51.158 | 50 | | laptop | 52.544 | mouse | 58.780 | remote | 25.107 | 51 | | keyboard | 47.750 | cell phone | 30.600 | microwave | 49.704 | 52 | | oven | 27.081 | toaster | 26.784 | sink | 32.348 | 53 | | refrigerator | 49.154 | book | 12.779 | clock | 47.372 | 54 | | vase | 32.067 | 
scissors | 22.869 | teddy bear | 39.427 | 55 | | hair drier | 5.461 | toothbrush | 19.293 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/anchor.res50.fpn.coco.800size.3x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | ANCHOR_GENERATOR=dict( 10 | SIZES=[[32], [64], [128], [256], [512]], 11 | ASPECT_RATIOS=[[1.0]], 12 | OFFSET=0.5, 13 | ), 14 | FCOS=dict( 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | IOU_TOPK=1, 24 | ), 25 | NMS_TYPE=None, 26 | ), 27 | DATASETS=dict( 28 | TRAIN=("coco_2017_train",), 29 | TEST=("coco_2017_val",), 30 | ), 31 | SOLVER=dict( 32 | CHECKPOINT_PERIOD=10000, 33 | LR_SCHEDULER=dict( 34 | MAX_ITER=270000, 35 | STEPS=(210000, 250000), 36 | ), 37 | OPTIMIZER=dict( 38 | BASE_LR=0.01, 39 | ), 40 | IMS_PER_BATCH=16, 41 | ), 42 | INPUT=dict( 43 | AUG=dict( 44 | TRAIN_PIPELINES=[ 45 | ("ResizeShortestEdge", 46 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 47 | ("RandomFlip", dict()), 48 | ], 49 | TEST_PIPELINES=[ 50 | ("ResizeShortestEdge", 51 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 52 | ], 53 | ) 54 | ), 55 | TEST=dict( 56 | EVAL_PEROID=10000, 57 | ), 58 | OUTPUT_DIR=osp.join( 59 | '/data/Outputs/model_logs/cvpods_playground', 60 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 61 | ) 62 | 63 | 64 | class CustomFCOSConfig(FCOSConfig): 65 | def __init__(self): 66 | super(CustomFCOSConfig, self).__init__() 67 | self._register_configuration(_config_dict) 68 | 69 | 70 | config = CustomFCOSConfig() 71 | -------------------------------------------------------------------------------- /playground/detection/coco/anchor.res50.fpn.coco.800size.3x_ms/fcos.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | from cvpods.layers import ShapeSpec, cat, generalized_batched_nms 10 | from cvpods.modeling.box_regression import Box2BoxTransform 11 | from cvpods.modeling.losses import iou_loss, sigmoid_focal_loss_jit 12 | from cvpods.modeling.meta_arch.retinanet import ( 13 | permute_to_N_HWA_K, 14 | permute_all_cls_and_box_to_N_HWA_K_and_concat 15 | ) 16 | from cvpods.modeling.postprocessing import detector_postprocess 17 | from cvpods.structures import Boxes, ImageList, Instances, pairwise_iou 18 | from cvpods.utils import comm, get_event_storage, log_first_n 19 | 20 | 21 | class FCOS(nn.Module): 22 | """ 23 | Implement FCOS (https://arxiv.org/abs/1904.01355). 
24 | """ 25 | def __init__(self, cfg): 26 | super().__init__() 27 | 28 | self.device = torch.device(cfg.MODEL.DEVICE) 29 | 30 | # fmt: off 31 | self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES 32 | self.in_features = cfg.MODEL.FCOS.IN_FEATURES 33 | # Loss parameters: 34 | self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA 35 | self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA 36 | self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE 37 | self.reg_weight = cfg.MODEL.FCOS.REG_WEIGHT 38 | # Inference parameters: 39 | self.score_threshold = cfg.MODEL.FCOS.SCORE_THRESH_TEST 40 | self.topk_candidates = cfg.MODEL.FCOS.TOPK_CANDIDATES_TEST 41 | self.nms_threshold = cfg.MODEL.FCOS.NMS_THRESH_TEST 42 | self.nms_type = cfg.MODEL.NMS_TYPE 43 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 44 | # fmt: on 45 | 46 | self.backbone = cfg.build_backbone( 47 | cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) 48 | 49 | backbone_shape = self.backbone.output_shape() 50 | feature_shapes = [backbone_shape[f] for f in self.in_features] 51 | self.head = FCOSHead(cfg, feature_shapes) 52 | self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes) 53 | 54 | # Matching and loss 55 | self.box2box_transform = Box2BoxTransform( 56 | weights=cfg.MODEL.FCOS.BBOX_REG_WEIGHTS) 57 | self.iou_topk = cfg.MODEL.POTO.IOU_TOPK 58 | 59 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( 60 | 3, 1, 1) 61 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( 62 | 3, 1, 1) 63 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 64 | self.to(self.device) 65 | 66 | def forward(self, batched_inputs): 67 | """ 68 | Args: 69 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 70 | Each item in the list contains the inputs for one image. 71 | For now, each item in the list is a dict that contains: 72 | 73 | * image: Tensor, image in (C, H, W) format. 74 | * instances: Instances 75 | 76 | Other information that's included in the original dicts, such as: 77 | 78 | * "height", "width" (int): the output resolution of the model, used in inference. 79 | See :meth:`postprocess` for details. 80 | Returns: 81 | dict[str: Tensor]: 82 | mapping from a named loss to a tensor storing the loss. Used during training only. 
83 | """ 84 | images = self.preprocess_image(batched_inputs) 85 | if "instances" in batched_inputs[0]: 86 | gt_instances = [ 87 | x["instances"].to(self.device) for x in batched_inputs 88 | ] 89 | elif "targets" in batched_inputs[0]: 90 | log_first_n( 91 | logging.WARN, 92 | "'targets' in the model inputs is now renamed to 'instances'!", 93 | n=10) 94 | gt_instances = [ 95 | x["targets"].to(self.device) for x in batched_inputs 96 | ] 97 | else: 98 | gt_instances = None 99 | 100 | features = self.backbone(images.tensor) 101 | features = [features[f] for f in self.in_features] 102 | box_cls, box_delta = self.head(features) 103 | anchors = self.anchor_generator(features) 104 | 105 | if self.training: 106 | gt_classes, gt_anchors_reg_deltas = self.get_ground_truth( 107 | anchors, gt_instances) 108 | return self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, 109 | box_delta, anchors) 110 | else: 111 | results = self.inference(box_cls, box_delta, anchors, images) 112 | processed_results = [] 113 | for results_per_image, input_per_image, image_size in zip( 114 | results, batched_inputs, images.image_sizes): 115 | height = input_per_image.get("height", image_size[0]) 116 | width = input_per_image.get("width", image_size[1]) 117 | r = detector_postprocess(results_per_image, height, width) 118 | processed_results.append({"instances": r}) 119 | return processed_results 120 | 121 | def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits, 122 | pred_anchor_deltas, anchors): 123 | """ 124 | Args: 125 | For `gt_classes` and `gt_anchors_deltas` parameters, see 126 | :meth:`FCOS.get_ground_truth`. 127 | Their shapes are (N, R) and (N, R, 4), respectively, where R is 128 | the total number of anchors across levels, i.e. sum(Hi x Wi) 129 | For `pred_class_logits` and `pred_anchor_deltas`, see 130 | :meth:`FCOSHead.forward`. 131 | 132 | Returns: 133 | dict[str: Tensor]: 134 | mapping from a named loss to a scalar tensor 135 | storing the loss. Used during training only. The dict keys are: 136 | "loss_cls" and "loss_box_reg" 137 | """ 138 | pred_class_logits, pred_anchor_deltas = \ 139 | permute_all_cls_and_box_to_N_HWA_K_and_concat( 140 | pred_class_logits, pred_anchor_deltas, self.num_classes 141 | ) # Shapes: (N x R, K) and (N x R, 4), respectively. 
142 | 143 | gt_classes = gt_classes.flatten() 144 | gt_anchors_deltas = gt_anchors_deltas.view(-1, 4) 145 | 146 | valid_idxs = gt_classes >= 0 147 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 148 | num_foreground = foreground_idxs.sum() 149 | 150 | gt_classes_target = torch.zeros_like(pred_class_logits) 151 | gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1 152 | 153 | num_foreground = comm.all_reduce(num_foreground) / float(comm.get_world_size()) 154 | 155 | # logits loss 156 | loss_cls = sigmoid_focal_loss_jit( 157 | pred_class_logits[valid_idxs], 158 | gt_classes_target[valid_idxs], 159 | alpha=self.focal_loss_alpha, 160 | gamma=self.focal_loss_gamma, 161 | reduction="sum", 162 | ) / max(1.0, num_foreground) 163 | 164 | anchors = Boxes.cat([Boxes.cat(anchors_i) for anchors_i in anchors]) 165 | pred_anchor_deltas = self.box2box_transform.apply_deltas( 166 | pred_anchor_deltas, anchors.tensor 167 | ) 168 | gt_anchors_deltas = self.box2box_transform.apply_deltas( 169 | gt_anchors_deltas, anchors.tensor 170 | ) 171 | 172 | # regression loss 173 | loss_box_reg = iou_loss( 174 | pred_anchor_deltas[foreground_idxs], 175 | gt_anchors_deltas[foreground_idxs], 176 | box_mode="xyxy", 177 | loss_type=self.iou_loss_type, 178 | reduction="sum", 179 | ) / max(1.0, num_foreground) * self.reg_weight 180 | 181 | return { 182 | "loss_cls": loss_cls, 183 | "loss_box_reg": loss_box_reg, 184 | } 185 | 186 | @torch.no_grad() 187 | def get_ground_truth(self, anchors, targets): 188 | """ 189 | Args: 190 | anchors (list[list[Boxes]]): a list of N=#image elements. Each is a 191 | list of #feature level Boxes. The Boxes contains anchors of 192 | this image on the specific feature level. 193 | targets (list[Instances]): a list of N `Instances`s. The i-th 194 | `Instances` contains the ground-truth per-instance annotations 195 | for the i-th input image. Specify `targets` during training only. 196 | 197 | Returns: 198 | gt_classes (Tensor): 199 | An integer tensor of shape (N, R) storing ground-truth 200 | labels for each anchor. 201 | R is the total number of anchors, i.e. the sum of Hi x Wi for all levels. 202 | Anchors with an IoU with some target higher than the foreground threshold 203 | are assigned their corresponding label in the [0, K-1] range. 204 | Anchors whose IoU are below the background threshold are assigned 205 | the label "K". Anchors whose IoU are between the foreground and background 206 | thresholds are assigned a label "-1", i.e. ignore. 207 | gt_anchors_deltas (Tensor): 208 | Shape (N, R, 4). 209 | The last dimension represents ground-truth box2box transform 210 | targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box. 211 | The values in the tensor are meaningful only when the corresponding 212 | anchor is labeled as foreground. 
213 | """ 214 | gt_classes = [] 215 | gt_anchors_deltas = [] 216 | 217 | num_fg = 0 218 | num_gt = 0 219 | 220 | for anchors_per_image, targets_per_image in zip(anchors, targets): 221 | anchors_per_image = Boxes.cat(anchors_per_image) 222 | 223 | gt_boxes = targets_per_image.gt_boxes 224 | 225 | match_quality_matrix = pairwise_iou(gt_boxes, anchors_per_image) 226 | 227 | _, is_positive = match_quality_matrix.topk(self.iou_topk, dim=1) 228 | is_foreground = torch.zeros_like( 229 | match_quality_matrix, dtype=torch.bool 230 | ).scatter_(1, is_positive, True) 231 | 232 | match_quality_matrix[~is_foreground] = -1 233 | 234 | # if there are still more than one objects for a position, 235 | # we choose the one with maximum quality 236 | anchor_labels, gt_matched_idxs = match_quality_matrix.max(dim=0) 237 | 238 | num_fg += (anchor_labels != -1).sum().item() 239 | num_gt += len(targets_per_image) 240 | 241 | # ground truth box regression 242 | gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas( 243 | anchors_per_image.tensor, gt_boxes[gt_matched_idxs].tensor) 244 | 245 | # ground truth classes 246 | has_gt = len(targets_per_image) > 0 247 | if has_gt: 248 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 249 | # Anchors with label -1 are treated as background. 250 | gt_classes_i[anchor_labels == -1] = self.num_classes 251 | else: 252 | gt_classes_i = torch.zeros_like( 253 | gt_matched_idxs) + self.num_classes 254 | 255 | gt_classes.append(gt_classes_i) 256 | gt_anchors_deltas.append(gt_anchors_reg_deltas_i) 257 | 258 | get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt) 259 | 260 | return torch.stack(gt_classes), torch.stack(gt_anchors_deltas) 261 | 262 | def inference(self, box_cls, box_delta, anchors, images): 263 | """ 264 | Arguments: 265 | box_cls, box_delta: Same as the output of :meth:`FCOSHead.forward` 266 | anchors (list[list[Boxes]]): a list of #images elements. Each is a 267 | list of #feature level Boxes. The Boxes contain anchors of this 268 | image on the specific feature level. 269 | images (ImageList): the input images 270 | 271 | Returns: 272 | results (List[Instances]): a list of #images elements. 273 | """ 274 | assert len(anchors) == len(images) 275 | results = [] 276 | 277 | box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] 278 | box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] 279 | # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4) 280 | 281 | for img_idx, anchors_per_image in enumerate(anchors): 282 | image_size = images.image_sizes[img_idx] 283 | box_cls_per_image = [ 284 | box_cls_per_level[img_idx] for box_cls_per_level in box_cls 285 | ] 286 | box_reg_per_image = [ 287 | box_reg_per_level[img_idx] for box_reg_per_level in box_delta 288 | ] 289 | results_per_image = self.inference_single_image( 290 | box_cls_per_image, box_reg_per_image, anchors_per_image, 291 | tuple(image_size)) 292 | results.append(results_per_image) 293 | return results 294 | 295 | def inference_single_image(self, box_cls, box_delta, anchors, image_size): 296 | """ 297 | Single-image inference. Return bounding-box detection results by thresholding 298 | on scores and applying non-maximum suppression (NMS). 299 | 300 | Arguments: 301 | box_cls (list[Tensor]): list of #feature levels. Each entry contains 302 | tensor of size (H x W, K) 303 | box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. 304 | anchors (list[Boxes]): list of #feature levels. 
Each entry contains 305 | a Boxes object, which contains all the anchors for that 306 | image in that feature level. 307 | image_size (tuple(H, W)): a tuple of the image height and width. 308 | 309 | Returns: 310 | Same as `inference`, but for only one image. 311 | """ 312 | boxes_all = [] 313 | scores_all = [] 314 | class_idxs_all = [] 315 | 316 | # Iterate over every feature level 317 | for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors): 318 | # (HxWxK,) 319 | box_cls_i = box_cls_i.sigmoid_().flatten() 320 | 321 | # Keep top k top scoring indices only. 322 | num_topk = min(self.topk_candidates, box_reg_i.size(0)) 323 | # torch.sort is actually faster than .topk (at least on GPUs) 324 | predicted_prob, topk_idxs = box_cls_i.sort(descending=True) 325 | predicted_prob = predicted_prob[:num_topk] 326 | topk_idxs = topk_idxs[:num_topk] 327 | 328 | # filter out the proposals with low confidence score 329 | keep_idxs = predicted_prob > self.score_threshold 330 | predicted_prob = predicted_prob[keep_idxs] 331 | topk_idxs = topk_idxs[keep_idxs] 332 | 333 | anchor_idxs = topk_idxs // self.num_classes 334 | classes_idxs = topk_idxs % self.num_classes 335 | 336 | box_reg_i = box_reg_i[anchor_idxs] 337 | anchors_i = anchors_i[anchor_idxs] 338 | # predict boxes 339 | predicted_boxes = self.box2box_transform.apply_deltas( 340 | box_reg_i, anchors_i.tensor) 341 | 342 | boxes_all.append(predicted_boxes) 343 | scores_all.append(predicted_prob) 344 | class_idxs_all.append(classes_idxs) 345 | 346 | boxes_all, scores_all, class_idxs_all = [ 347 | cat(x) for x in [boxes_all, scores_all, class_idxs_all] 348 | ] 349 | 350 | if self.nms_type is None: 351 | # strategies above (e.g. topk_candidates and score_threshold) are 352 | # useless for POTO, just keep them for debug and analysis 353 | keep = scores_all.argsort(descending=True) 354 | else: 355 | keep = generalized_batched_nms( 356 | boxes_all, scores_all, class_idxs_all, 357 | self.nms_threshold, nms_type=self.nms_type 358 | ) 359 | keep = keep[:self.max_detections_per_image] 360 | 361 | result = Instances(image_size) 362 | result.pred_boxes = Boxes(boxes_all[keep]) 363 | result.scores = scores_all[keep] 364 | result.pred_classes = class_idxs_all[keep] 365 | return result 366 | 367 | def preprocess_image(self, batched_inputs): 368 | """ 369 | Normalize, pad and batch the input images. 370 | """ 371 | images = [x["image"].to(self.device) for x in batched_inputs] 372 | images = [self.normalizer(x) for x in images] 373 | images = ImageList.from_tensors(images, 374 | self.backbone.size_divisibility) 375 | return images 376 | 377 | def _inference_for_ms_test(self, batched_inputs): 378 | """ 379 | function used for multiscale test, will be refactor in the future. 380 | The same input with `forward` function. 
381 | """ 382 | assert not self.training, "inference mode with training=True" 383 | assert len(batched_inputs) == 1, "inference image number > 1" 384 | images = self.preprocess_image(batched_inputs) 385 | 386 | features = self.backbone(images.tensor) 387 | features = [features[f] for f in self.in_features] 388 | box_cls, box_delta = self.head(features) 389 | anchors = self.anchor_generator(features) 390 | 391 | results = self.inference(box_cls, box_delta, anchors, images) 392 | for results_per_image, input_per_image, image_size in zip( 393 | results, batched_inputs, images.image_sizes 394 | ): 395 | height = input_per_image.get("height", image_size[0]) 396 | width = input_per_image.get("width", image_size[1]) 397 | processed_results = detector_postprocess(results_per_image, height, width) 398 | return processed_results 399 | 400 | 401 | class FCOSHead(nn.Module): 402 | """ 403 | The head used in FCOS for object classification and box regression. 404 | It has two subnets for the two tasks, with a common structure but separate parameters. 405 | """ 406 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 407 | super().__init__() 408 | # fmt: off 409 | in_channels = input_shape[0].channels 410 | num_classes = cfg.MODEL.FCOS.NUM_CLASSES 411 | num_convs = cfg.MODEL.FCOS.NUM_CONVS 412 | prior_prob = cfg.MODEL.FCOS.PRIOR_PROB 413 | num_anchors = cfg.build_anchor_generator(cfg, input_shape).num_cell_anchors 414 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 415 | self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS 416 | # fmt: on 417 | assert len(set(num_anchors)) == 1, "using differenct num_anchors value is not supported" 418 | num_anchors = num_anchors[0] 419 | 420 | cls_subnet = [] 421 | bbox_subnet = [] 422 | for _ in range(num_convs): 423 | cls_subnet.append( 424 | nn.Conv2d(in_channels, 425 | in_channels, 426 | kernel_size=3, 427 | stride=1, 428 | padding=1)) 429 | cls_subnet.append(nn.GroupNorm(32, in_channels)) 430 | cls_subnet.append(nn.ReLU()) 431 | bbox_subnet.append( 432 | nn.Conv2d(in_channels, 433 | in_channels, 434 | kernel_size=3, 435 | stride=1, 436 | padding=1)) 437 | bbox_subnet.append(nn.GroupNorm(32, in_channels)) 438 | bbox_subnet.append(nn.ReLU()) 439 | 440 | self.cls_subnet = nn.Sequential(*cls_subnet) 441 | self.bbox_subnet = nn.Sequential(*bbox_subnet) 442 | self.cls_score = nn.Conv2d(in_channels, 443 | num_anchors * num_classes, 444 | kernel_size=3, 445 | stride=1, 446 | padding=1) 447 | self.bbox_pred = nn.Conv2d(in_channels, 448 | num_anchors * 4, 449 | kernel_size=3, 450 | stride=1, 451 | padding=1) 452 | 453 | # Initialization 454 | for modules in [ 455 | self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred 456 | ]: 457 | for layer in modules.modules(): 458 | if isinstance(layer, nn.Conv2d): 459 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 460 | torch.nn.init.constant_(layer.bias, 0) 461 | if isinstance(layer, nn.GroupNorm): 462 | torch.nn.init.constant_(layer.weight, 1) 463 | torch.nn.init.constant_(layer.bias, 0) 464 | 465 | # Use prior in model initialization to improve stability 466 | bias_value = -math.log((1 - prior_prob) / prior_prob) 467 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 468 | 469 | def forward(self, features): 470 | """ 471 | Arguments: 472 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 473 | Each tensor in the list correspond to different feature levels. 474 | 475 | Returns: 476 | logits (list[Tensor]): #lvl tensors, each has shape (N, K, Hi, Wi). 
477 | The tensor predicts the classification probability 478 | at each spatial position for each of the K object classes. 479 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, 4, Hi, Wi). 480 | The tensor predicts 4-vector (dl,dt,dr,db) box 481 | regression values for every anchor. These values are the 482 | relative offset between the anchor and the ground truth box. 483 | """ 484 | logits = [] 485 | bbox_reg = [] 486 | for feature in features: 487 | cls_subnet = self.cls_subnet(feature) 488 | bbox_subnet = self.bbox_subnet(feature) 489 | 490 | logits.append(self.cls_score(cls_subnet)) 491 | bbox_reg.append(self.bbox_pred(bbox_subnet)) 492 | return logits, bbox_reg 493 | -------------------------------------------------------------------------------- /playground/detection/coco/anchor.res50.fpn.coco.800size.3x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import DefaultAnchorGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_anchor_generator(cfg, input_shape): 27 | 28 | return DefaultAnchorGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_anchor_generator = build_anchor_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/center.res50.fpn.coco.800size.3x_ms/README.md: -------------------------------------------------------------------------------- 1 | # center.res50.fpn.coco.800size.3x_ms 2 | 3 | seed: 23440541 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.491 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.388 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.212 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.391 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.436 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.323 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.549 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.610 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.401 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.646 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.775 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 35.184 | 49.112 | 38.833 | 21.240 | 39.081 | 43.624 | 24 | 25 | ### Per-category bbox 
AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 50.745 | bicycle | 24.736 | car | 40.273 | 30 | | motorcycle | 35.226 | airplane | 56.666 | bus | 58.531 | 31 | | train | 56.242 | truck | 30.254 | boat | 21.643 | 32 | | traffic light | 23.882 | fire hydrant | 59.002 | stop sign | 57.892 | 33 | | parking meter | 38.458 | bench | 16.190 | bird | 30.530 | 34 | | cat | 56.696 | dog | 49.404 | horse | 49.831 | 35 | | sheep | 44.098 | cow | 52.376 | elephant | 57.041 | 36 | | bear | 61.582 | zebra | 64.655 | giraffe | 63.143 | 37 | | backpack | 12.811 | umbrella | 32.360 | handbag | 11.444 | 38 | | tie | 27.783 | suitcase | 31.496 | frisbee | 57.670 | 39 | | skis | 17.429 | snowboard | 27.009 | sports ball | 44.850 | 40 | | kite | 41.692 | baseball bat | 22.479 | baseball glove | 30.474 | 41 | | skateboard | 45.098 | surfboard | 29.039 | tennis racket | 39.681 | 42 | | bottle | 33.861 | wine glass | 31.908 | cup | 34.875 | 43 | | fork | 23.307 | knife | 12.405 | spoon | 10.374 | 44 | | bowl | 35.602 | banana | 18.808 | apple | 14.727 | 45 | | sandwich | 27.692 | orange | 29.594 | broccoli | 17.581 | 46 | | carrot | 17.170 | hot dog | 26.330 | pizza | 44.958 | 47 | | donut | 39.596 | cake | 27.949 | chair | 22.646 | 48 | | couch | 37.705 | potted plant | 20.811 | bed | 35.757 | 49 | | dining table | 22.243 | toilet | 53.837 | tv | 48.742 | 50 | | laptop | 47.942 | mouse | 56.732 | remote | 23.299 | 51 | | keyboard | 43.587 | cell phone | 28.547 | microwave | 49.056 | 52 | | oven | 27.945 | toaster | 38.837 | sink | 31.613 | 53 | | refrigerator | 44.433 | book | 10.468 | clock | 44.014 | 54 | | vase | 34.515 | scissors | 21.610 | teddy bear | 36.052 | 55 | | hair drier | 3.383 | toothbrush | 13.791 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/center.res50.fpn.coco.800size.3x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | OBJECT_SIZES_OF_INTEREST=[ 24 | [-1, 64], 25 | [64, 128], 26 | [128, 256], 27 | [256, 512], 28 | [512, float("inf")], 29 | ], 30 | DISTANCE_TOPK=1, 31 | ), 32 | NMS_TYPE=None, 33 | ), 34 | DATASETS=dict( 35 | TRAIN=("coco_2017_train",), 36 | TEST=("coco_2017_val",), 37 | ), 38 | SOLVER=dict( 39 | CHECKPOINT_PERIOD=10000, 40 | LR_SCHEDULER=dict( 41 | MAX_ITER=270000, 42 | STEPS=(210000, 250000), 43 | ), 44 | OPTIMIZER=dict( 45 | BASE_LR=0.01, 46 | ), 47 | IMS_PER_BATCH=16, 48 | ), 49 | INPUT=dict( 50 | AUG=dict( 51 | TRAIN_PIPELINES=[ 52 | ("ResizeShortestEdge", 53 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 54 | ("RandomFlip", dict()), 55 | ], 56 | TEST_PIPELINES=[ 57 | ("ResizeShortestEdge", 58 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 59 | ], 60 | ) 61 | ), 62 | TEST=dict( 63 | EVAL_PEROID=10000, 64 | ), 
65 | OUTPUT_DIR=osp.join( 66 | '/data/Outputs/model_logs/cvpods_playground', 67 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 68 | ) 69 | 70 | 71 | class CustomFCOSConfig(FCOSConfig): 72 | def __init__(self): 73 | super(CustomFCOSConfig, self).__init__() 74 | self._register_configuration(_config_dict) 75 | 76 | 77 | config = CustomFCOSConfig() 78 | -------------------------------------------------------------------------------- /playground/detection/coco/center.res50.fpn.coco.800size.3x_ms/fcos.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | from cvpods.layers import ShapeSpec, cat, generalized_batched_nms 10 | from cvpods.modeling.box_regression import Shift2BoxTransform 11 | from cvpods.modeling.losses import iou_loss, sigmoid_focal_loss_jit 12 | from cvpods.modeling.meta_arch.fcos import Scale 13 | from cvpods.modeling.meta_arch.retinanet import ( 14 | permute_to_N_HWA_K, 15 | permute_all_cls_and_box_to_N_HWA_K_and_concat 16 | ) 17 | from cvpods.modeling.postprocessing import detector_postprocess 18 | from cvpods.structures import Boxes, ImageList, Instances 19 | from cvpods.utils import comm, get_event_storage, log_first_n 20 | 21 | 22 | class FCOS(nn.Module): 23 | """ 24 | Implement FCOS (https://arxiv.org/abs/1904.01355). 25 | """ 26 | def __init__(self, cfg): 27 | super().__init__() 28 | 29 | self.device = torch.device(cfg.MODEL.DEVICE) 30 | 31 | # fmt: off 32 | self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES 33 | self.in_features = cfg.MODEL.FCOS.IN_FEATURES 34 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 35 | # Loss parameters: 36 | self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA 37 | self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA 38 | self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE 39 | self.reg_weight = cfg.MODEL.FCOS.REG_WEIGHT 40 | # Inference parameters: 41 | self.score_threshold = cfg.MODEL.FCOS.SCORE_THRESH_TEST 42 | self.topk_candidates = cfg.MODEL.FCOS.TOPK_CANDIDATES_TEST 43 | self.nms_threshold = cfg.MODEL.FCOS.NMS_THRESH_TEST 44 | self.nms_type = cfg.MODEL.NMS_TYPE 45 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 46 | # fmt: on 47 | 48 | self.backbone = cfg.build_backbone( 49 | cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) 50 | 51 | backbone_shape = self.backbone.output_shape() 52 | feature_shapes = [backbone_shape[f] for f in self.in_features] 53 | self.head = FCOSHead(cfg, feature_shapes) 54 | self.shift_generator = cfg.build_shift_generator(cfg, feature_shapes) 55 | 56 | # Matching and loss 57 | self.shift2box_transform = Shift2BoxTransform( 58 | weights=cfg.MODEL.FCOS.BBOX_REG_WEIGHTS) 59 | self.object_sizes_of_interest = cfg.MODEL.POTO.OBJECT_SIZES_OF_INTEREST 60 | self.distance_topk = cfg.MODEL.POTO.DISTANCE_TOPK 61 | 62 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( 63 | 3, 1, 1) 64 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( 65 | 3, 1, 1) 66 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 67 | self.to(self.device) 68 | 69 | def forward(self, batched_inputs): 70 | """ 71 | Args: 72 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 73 | Each item in the list contains the inputs for one image. 74 | For now, each item in the list is a dict that contains: 75 | 76 | * image: Tensor, image in (C, H, W) format. 
77 | * instances: Instances 78 | 79 | Other information that's included in the original dicts, such as: 80 | 81 | * "height", "width" (int): the output resolution of the model, used in inference. 82 | See :meth:`postprocess` for details. 83 | Returns: 84 | dict[str: Tensor]: 85 | mapping from a named loss to a tensor storing the loss. Used during training only. 86 | """ 87 | images = self.preprocess_image(batched_inputs) 88 | if "instances" in batched_inputs[0]: 89 | gt_instances = [ 90 | x["instances"].to(self.device) for x in batched_inputs 91 | ] 92 | elif "targets" in batched_inputs[0]: 93 | log_first_n( 94 | logging.WARN, 95 | "'targets' in the model inputs is now renamed to 'instances'!", 96 | n=10) 97 | gt_instances = [ 98 | x["targets"].to(self.device) for x in batched_inputs 99 | ] 100 | else: 101 | gt_instances = None 102 | 103 | features = self.backbone(images.tensor) 104 | features = [features[f] for f in self.in_features] 105 | box_cls, box_delta = self.head(features) 106 | shifts = self.shift_generator(features) 107 | 108 | if self.training: 109 | gt_classes, gt_shifts_reg_deltas = self.get_ground_truth( 110 | shifts, gt_instances) 111 | return self.losses(gt_classes, gt_shifts_reg_deltas, box_cls, box_delta) 112 | else: 113 | results = self.inference(box_cls, box_delta, shifts, images) 114 | processed_results = [] 115 | for results_per_image, input_per_image, image_size in zip( 116 | results, batched_inputs, images.image_sizes): 117 | height = input_per_image.get("height", image_size[0]) 118 | width = input_per_image.get("width", image_size[1]) 119 | r = detector_postprocess(results_per_image, height, width) 120 | processed_results.append({"instances": r}) 121 | return processed_results 122 | 123 | def losses(self, gt_classes, gt_shifts_deltas, pred_class_logits, 124 | pred_shift_deltas): 125 | """ 126 | Args: 127 | For `gt_classes` and `gt_shifts_deltas` parameters, see 128 | :meth:`FCOS.get_ground_truth`. 129 | Their shapes are (N, R) and (N, R, 4), respectively, where R is 130 | the total number of shifts across levels, i.e. sum(Hi x Wi) 131 | For `pred_class_logits` and `pred_shift_deltas`, see 132 | :meth:`FCOSHead.forward`. 133 | 134 | Returns: 135 | dict[str: Tensor]: 136 | mapping from a named loss to a scalar tensor 137 | storing the loss. Used during training only. The dict keys are: 138 | "loss_cls" and "loss_box_reg" 139 | """ 140 | pred_class_logits, pred_shift_deltas = \ 141 | permute_all_cls_and_box_to_N_HWA_K_and_concat( 142 | pred_class_logits, pred_shift_deltas, self.num_classes 143 | ) # Shapes: (N x R, K) and (N x R, 4), respectively. 
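        # The targets are flattened below to match: classification targets become a
        # one-hot matrix over the K classes (background rows stay all zero), the focal
        # loss is summed over valid shifts and divided by the number of foreground
        # shifts averaged across GPUs, and the box loss is an IoU-style loss (GIoU in
        # this config) on foreground shifts only, scaled by REG_WEIGHT.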
144 | 145 | gt_classes = gt_classes.flatten() 146 | gt_shifts_deltas = gt_shifts_deltas.view(-1, 4) 147 | 148 | valid_idxs = gt_classes >= 0 149 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 150 | num_foreground = foreground_idxs.sum() 151 | 152 | gt_classes_target = torch.zeros_like(pred_class_logits) 153 | gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1 154 | 155 | num_foreground = comm.all_reduce(num_foreground) / float(comm.get_world_size()) 156 | 157 | # logits loss 158 | loss_cls = sigmoid_focal_loss_jit( 159 | pred_class_logits[valid_idxs], 160 | gt_classes_target[valid_idxs], 161 | alpha=self.focal_loss_alpha, 162 | gamma=self.focal_loss_gamma, 163 | reduction="sum", 164 | ) / max(1.0, num_foreground) 165 | 166 | # regression loss 167 | loss_box_reg = iou_loss( 168 | pred_shift_deltas[foreground_idxs], 169 | gt_shifts_deltas[foreground_idxs], 170 | box_mode="ltrb", 171 | loss_type=self.iou_loss_type, 172 | reduction="sum", 173 | ) / max(1.0, num_foreground) * self.reg_weight 174 | 175 | return { 176 | "loss_cls": loss_cls, 177 | "loss_box_reg": loss_box_reg, 178 | } 179 | 180 | @torch.no_grad() 181 | def get_ground_truth(self, shifts, targets): 182 | """ 183 | Args: 184 | shifts (list[list[Tensor]]): a list of N=#image elements. Each is a 185 | list of #feature level tensors. The tensors contains shifts of 186 | this image on the specific feature level. 187 | targets (list[Instances]): a list of N `Instances`s. The i-th 188 | `Instances` contains the ground-truth per-instance annotations 189 | for the i-th input image. Specify `targets` during training only. 190 | 191 | Returns: 192 | gt_classes (Tensor): 193 | An integer tensor of shape (N, R) storing ground-truth 194 | labels for each shift. 195 | R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. 196 | Shifts in the valid boxes are assigned their corresponding label in the 197 | [0, K-1] range. Shifts in the background are assigned the label "K". 198 | Shifts in the ignore areas are assigned a label "-1", i.e. ignore. 199 | gt_shifts_deltas (Tensor): 200 | Shape (N, R, 4). 201 | The last dimension represents ground-truth shift2box transform 202 | targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. 203 | The values in the tensor are meaningful only when the corresponding 204 | shift is labeled as foreground. 
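
        Note: in this "center" assignment, every ground-truth box first keeps its
        DISTANCE_TOPK closest shifts (by distance to the box center) on each feature
        level as candidates; candidates for which the box does not fit the level's
        object size range are dropped, and a shift claimed by several boxes is given
        to the box with the smallest area.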
205 | """ 206 | gt_classes = [] 207 | gt_shifts_deltas = [] 208 | 209 | num_fg = 0 210 | num_gt = 0 211 | 212 | for shifts_per_image, targets_per_image in zip(shifts, targets): 213 | object_sizes_of_interest = torch.cat([ 214 | shifts_i.new_tensor(size).unsqueeze(0).expand( 215 | shifts_i.size(0), -1) for shifts_i, size in zip( 216 | shifts_per_image, self.object_sizes_of_interest) 217 | ], dim=0) 218 | 219 | shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) 220 | 221 | gt_boxes = targets_per_image.gt_boxes 222 | 223 | max_deltas = self.shift2box_transform.get_deltas( 224 | shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1) 225 | ).max(dim=2).values 226 | # limit the regression range for each location 227 | is_cared_in_the_level = \ 228 | (max_deltas >= object_sizes_of_interest[None, :, 0]) & \ 229 | (max_deltas <= object_sizes_of_interest[None, :, 1]) 230 | 231 | candidate_idxs = [] 232 | base = 0 233 | for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): 234 | distances = torch.cdist(gt_boxes.get_centers(), shifts_i) 235 | _, topk_idxs = distances.topk( 236 | self.distance_topk, dim=1, largest=False) 237 | candidate_idxs.append(base + topk_idxs) 238 | base += len(shifts_i) 239 | candidate_idxs = torch.cat(candidate_idxs, dim=1) 240 | 241 | is_foreground = torch.zeros_like( 242 | is_cared_in_the_level 243 | ).scatter_(1, candidate_idxs, True) 244 | 245 | gt_positions_area = gt_boxes.area().unsqueeze(1).repeat( 246 | 1, shifts_over_all_feature_maps.size(0)) 247 | gt_positions_area[~is_cared_in_the_level] = math.inf 248 | gt_positions_area[~is_foreground] = math.inf 249 | 250 | # if there are still more than one objects for a position, 251 | # we choose the one with minimal area 252 | positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0) 253 | 254 | num_fg += (positions_min_area != math.inf).sum().item() 255 | num_gt += len(targets_per_image) 256 | 257 | # ground truth box regression 258 | gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas( 259 | shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor) 260 | 261 | # ground truth classes 262 | has_gt = len(targets_per_image) > 0 263 | if has_gt: 264 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 265 | # Shifts with area inf are treated as background. 266 | gt_classes_i[positions_min_area == math.inf] = self.num_classes 267 | else: 268 | gt_classes_i = torch.zeros_like( 269 | gt_matched_idxs) + self.num_classes 270 | 271 | gt_classes.append(gt_classes_i) 272 | gt_shifts_deltas.append(gt_shifts_reg_deltas_i) 273 | 274 | get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt) 275 | 276 | return torch.stack(gt_classes), torch.stack(gt_shifts_deltas) 277 | 278 | def inference(self, box_cls, box_delta, shifts, images): 279 | """ 280 | Arguments: 281 | box_cls, box_delta: Same as the output of :meth:`FCOSHead.forward` 282 | shifts (list[list[Tensor]): a list of #images elements. Each is a 283 | list of #feature level tensor. The tensor contain shifts of this 284 | image on the specific feature level. 285 | images (ImageList): the input images 286 | 287 | Returns: 288 | results (List[Instances]): a list of #images elements. 
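
        Note: because NMS_TYPE is None in this config, inference_single_image only
        sorts the concatenated per-level detections by score and truncates them to
        DETECTIONS_PER_IMAGE; no NMS is applied.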
289 | """ 290 | assert len(shifts) == len(images) 291 | results = [] 292 | 293 | box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] 294 | box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] 295 | # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4) 296 | 297 | for img_idx, shifts_per_image in enumerate(shifts): 298 | image_size = images.image_sizes[img_idx] 299 | box_cls_per_image = [ 300 | box_cls_per_level[img_idx] for box_cls_per_level in box_cls 301 | ] 302 | box_reg_per_image = [ 303 | box_reg_per_level[img_idx] for box_reg_per_level in box_delta 304 | ] 305 | results_per_image = self.inference_single_image( 306 | box_cls_per_image, box_reg_per_image, shifts_per_image, 307 | tuple(image_size)) 308 | results.append(results_per_image) 309 | return results 310 | 311 | def inference_single_image(self, box_cls, box_delta, shifts, image_size): 312 | """ 313 | Single-image inference. Return bounding-box detection results by thresholding 314 | on scores and applying non-maximum suppression (NMS). 315 | 316 | Arguments: 317 | box_cls (list[Tensor]): list of #feature levels. Each entry contains 318 | tensor of size (H x W, K) 319 | box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. 320 | shifts (list[Tensor]): list of #feature levels. Each entry contains 321 | a tensor, which contains all the shifts for that 322 | image in that feature level. 323 | image_size (tuple(H, W)): a tuple of the image height and width. 324 | 325 | Returns: 326 | Same as `inference`, but for only one image. 327 | """ 328 | boxes_all = [] 329 | scores_all = [] 330 | class_idxs_all = [] 331 | 332 | # Iterate over every feature level 333 | for box_cls_i, box_reg_i, shifts_i in zip(box_cls, box_delta, shifts): 334 | # (HxWxK,) 335 | box_cls_i = box_cls_i.sigmoid_().flatten() 336 | 337 | # Keep top k top scoring indices only. 338 | num_topk = min(self.topk_candidates, box_reg_i.size(0)) 339 | # torch.sort is actually faster than .topk (at least on GPUs) 340 | predicted_prob, topk_idxs = box_cls_i.sort(descending=True) 341 | predicted_prob = predicted_prob[:num_topk] 342 | topk_idxs = topk_idxs[:num_topk] 343 | 344 | # filter out the proposals with low confidence score 345 | keep_idxs = predicted_prob > self.score_threshold 346 | predicted_prob = predicted_prob[keep_idxs] 347 | topk_idxs = topk_idxs[keep_idxs] 348 | 349 | shift_idxs = topk_idxs // self.num_classes 350 | classes_idxs = topk_idxs % self.num_classes 351 | 352 | box_reg_i = box_reg_i[shift_idxs] 353 | shifts_i = shifts_i[shift_idxs] 354 | # predict boxes 355 | predicted_boxes = self.shift2box_transform.apply_deltas( 356 | box_reg_i, shifts_i) 357 | 358 | boxes_all.append(predicted_boxes) 359 | scores_all.append(predicted_prob) 360 | class_idxs_all.append(classes_idxs) 361 | 362 | boxes_all, scores_all, class_idxs_all = [ 363 | cat(x) for x in [boxes_all, scores_all, class_idxs_all] 364 | ] 365 | 366 | if self.nms_type is None: 367 | # strategies above (e.g. 
topk_candidates and score_threshold) are 368 | # useless for POTO, just keep them for debug and analysis 369 | keep = scores_all.argsort(descending=True) 370 | else: 371 | keep = generalized_batched_nms( 372 | boxes_all, scores_all, class_idxs_all, 373 | self.nms_threshold, nms_type=self.nms_type 374 | ) 375 | keep = keep[:self.max_detections_per_image] 376 | 377 | result = Instances(image_size) 378 | result.pred_boxes = Boxes(boxes_all[keep]) 379 | result.scores = scores_all[keep] 380 | result.pred_classes = class_idxs_all[keep] 381 | return result 382 | 383 | def preprocess_image(self, batched_inputs): 384 | """ 385 | Normalize, pad and batch the input images. 386 | """ 387 | images = [x["image"].to(self.device) for x in batched_inputs] 388 | images = [self.normalizer(x) for x in images] 389 | images = ImageList.from_tensors(images, 390 | self.backbone.size_divisibility) 391 | return images 392 | 393 | def _inference_for_ms_test(self, batched_inputs): 394 | """ 395 | function used for multiscale test, will be refactor in the future. 396 | The same input with `forward` function. 397 | """ 398 | assert not self.training, "inference mode with training=True" 399 | assert len(batched_inputs) == 1, "inference image number > 1" 400 | images = self.preprocess_image(batched_inputs) 401 | 402 | features = self.backbone(images.tensor) 403 | features = [features[f] for f in self.in_features] 404 | box_cls, box_delta = self.head(features) 405 | shifts = self.shift_generator(features) 406 | 407 | results = self.inference(box_cls, box_delta, shifts, images) 408 | for results_per_image, input_per_image, image_size in zip( 409 | results, batched_inputs, images.image_sizes 410 | ): 411 | height = input_per_image.get("height", image_size[0]) 412 | width = input_per_image.get("width", image_size[1]) 413 | processed_results = detector_postprocess(results_per_image, height, width) 414 | return processed_results 415 | 416 | 417 | class FCOSHead(nn.Module): 418 | """ 419 | The head used in FCOS for object classification and box regression. 420 | It has two subnets for the two tasks, with a common structure but separate parameters. 
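
    Each subnet stacks NUM_CONVS blocks of 3x3 conv, GroupNorm(32) and ReLU; the final
    classification and box predictors are 3x3 convs, and a learnable per-level Scale is
    applied to the regression output. One head instance is shared across all FPN levels.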
421 | """ 422 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 423 | super().__init__() 424 | # fmt: off 425 | in_channels = input_shape[0].channels 426 | num_classes = cfg.MODEL.FCOS.NUM_CLASSES 427 | num_convs = cfg.MODEL.FCOS.NUM_CONVS 428 | prior_prob = cfg.MODEL.FCOS.PRIOR_PROB 429 | num_shifts = cfg.build_shift_generator(cfg, input_shape).num_cell_shifts 430 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 431 | self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS 432 | # fmt: on 433 | assert len(set(num_shifts)) == 1, "using differenct num_shifts value is not supported" 434 | num_shifts = num_shifts[0] 435 | 436 | cls_subnet = [] 437 | bbox_subnet = [] 438 | for _ in range(num_convs): 439 | cls_subnet.append( 440 | nn.Conv2d(in_channels, 441 | in_channels, 442 | kernel_size=3, 443 | stride=1, 444 | padding=1)) 445 | cls_subnet.append(nn.GroupNorm(32, in_channels)) 446 | cls_subnet.append(nn.ReLU()) 447 | bbox_subnet.append( 448 | nn.Conv2d(in_channels, 449 | in_channels, 450 | kernel_size=3, 451 | stride=1, 452 | padding=1)) 453 | bbox_subnet.append(nn.GroupNorm(32, in_channels)) 454 | bbox_subnet.append(nn.ReLU()) 455 | 456 | self.cls_subnet = nn.Sequential(*cls_subnet) 457 | self.bbox_subnet = nn.Sequential(*bbox_subnet) 458 | self.cls_score = nn.Conv2d(in_channels, 459 | num_shifts * num_classes, 460 | kernel_size=3, 461 | stride=1, 462 | padding=1) 463 | self.bbox_pred = nn.Conv2d(in_channels, 464 | num_shifts * 4, 465 | kernel_size=3, 466 | stride=1, 467 | padding=1) 468 | 469 | # Initialization 470 | for modules in [ 471 | self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred 472 | ]: 473 | for layer in modules.modules(): 474 | if isinstance(layer, nn.Conv2d): 475 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 476 | torch.nn.init.constant_(layer.bias, 0) 477 | if isinstance(layer, nn.GroupNorm): 478 | torch.nn.init.constant_(layer.weight, 1) 479 | torch.nn.init.constant_(layer.bias, 0) 480 | 481 | # Use prior in model initialization to improve stability 482 | bias_value = -math.log((1 - prior_prob) / prior_prob) 483 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 484 | 485 | self.scales = nn.ModuleList( 486 | [Scale(init_value=1.0) for _ in range(len(self.fpn_strides))]) 487 | 488 | def forward(self, features): 489 | """ 490 | Arguments: 491 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 492 | Each tensor in the list correspond to different feature levels. 493 | 494 | Returns: 495 | logits (list[Tensor]): #lvl tensors, each has shape (N, K, Hi, Wi). 496 | The tensor predicts the classification probability 497 | at each spatial position for each of the K object classes. 498 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, 4, Hi, Wi). 499 | The tensor predicts 4-vector (dl,dt,dr,db) box 500 | regression values for every shift. These values are the 501 | relative offset between the shift and the ground truth box. 
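
            Note: when NORM_REG_TARGETS is enabled (as in this config), the regression
            output is passed through ReLU and multiplied by the FPN stride of its level;
            otherwise it is exponentiated.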
502 | """ 503 | logits = [] 504 | bbox_reg = [] 505 | for level, feature in enumerate(features): 506 | cls_subnet = self.cls_subnet(feature) 507 | bbox_subnet = self.bbox_subnet(feature) 508 | 509 | logits.append(self.cls_score(cls_subnet)) 510 | 511 | bbox_pred = self.scales[level](self.bbox_pred(bbox_subnet)) 512 | if self.norm_reg_targets: 513 | bbox_reg.append(F.relu(bbox_pred) * self.fpn_strides[level]) 514 | else: 515 | bbox_reg.append(torch.exp(bbox_pred)) 516 | return logits, bbox_reg 517 | -------------------------------------------------------------------------------- /playground/detection/coco/center.res50.fpn.coco.800size.3x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness/README.md: -------------------------------------------------------------------------------- 1 | # fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness 2 | 3 | seed: 47789800 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.409 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.602 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.441 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.241 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.452 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.524 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.333 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.548 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.584 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.382 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.623 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.731 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 40.944 | 60.167 | 44.113 | 24.072 | 45.182 | 52.421 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 54.384 | bicycle | 31.306 
| car | 44.309 | 30 | | motorcycle | 42.440 | airplane | 67.711 | bus | 66.455 | 31 | | train | 64.182 | truck | 35.696 | boat | 26.276 | 32 | | traffic light | 27.771 | fire hydrant | 67.507 | stop sign | 65.803 | 33 | | parking meter | 42.677 | bench | 22.744 | bird | 36.077 | 34 | | cat | 65.877 | dog | 61.872 | horse | 55.550 | 35 | | sheep | 53.451 | cow | 58.303 | elephant | 65.260 | 36 | | bear | 72.269 | zebra | 69.098 | giraffe | 66.824 | 37 | | backpack | 16.122 | umbrella | 38.972 | handbag | 15.176 | 38 | | tie | 33.290 | suitcase | 39.192 | frisbee | 65.786 | 39 | | skis | 21.490 | snowboard | 35.935 | sports ball | 47.188 | 40 | | kite | 44.031 | baseball bat | 28.208 | baseball glove | 36.072 | 41 | | skateboard | 52.649 | surfboard | 31.155 | tennis racket | 47.645 | 42 | | bottle | 38.152 | wine glass | 37.058 | cup | 41.280 | 43 | | fork | 33.254 | knife | 14.530 | spoon | 14.718 | 44 | | bowl | 37.720 | banana | 23.842 | apple | 19.079 | 45 | | sandwich | 33.285 | orange | 31.455 | broccoli | 23.420 | 46 | | carrot | 19.758 | hot dog | 32.770 | pizza | 51.021 | 47 | | donut | 45.210 | cake | 35.831 | chair | 27.455 | 48 | | couch | 43.319 | potted plant | 27.762 | bed | 40.798 | 49 | | dining table | 26.791 | toilet | 61.554 | tv | 55.279 | 50 | | laptop | 57.426 | mouse | 62.401 | remote | 31.136 | 51 | | keyboard | 47.982 | cell phone | 33.081 | microwave | 55.147 | 52 | | oven | 33.120 | toaster | 36.481 | sink | 38.436 | 53 | | refrigerator | 53.491 | book | 12.543 | clock | 48.801 | 54 | | vase | 37.689 | scissors | 26.647 | teddy bear | 44.404 | 55 | | hair drier | 6.932 | toothbrush | 17.679 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=0.6, 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | CENTER_SAMPLING_RADIUS=1.5, 21 | OBJECT_SIZES_OF_INTEREST=[ 22 | [-1, 64], 23 | [64, 128], 24 | [128, 256], 25 | [256, 512], 26 | [512, float("inf")], 27 | ], 28 | ), 29 | ), 30 | DATASETS=dict( 31 | TRAIN=("coco_2017_train",), 32 | TEST=("coco_2017_val",), 33 | ), 34 | SOLVER=dict( 35 | CHECKPOINT_PERIOD=10000, 36 | LR_SCHEDULER=dict( 37 | MAX_ITER=270000, 38 | STEPS=(210000, 250000), 39 | ), 40 | OPTIMIZER=dict( 41 | BASE_LR=0.01, 42 | ), 43 | IMS_PER_BATCH=16, 44 | ), 45 | INPUT=dict( 46 | AUG=dict( 47 | TRAIN_PIPELINES=[ 48 | ("ResizeShortestEdge", 49 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", 54 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | EVAL_PEROID=10000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 
70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness/fcos.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | from typing import List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | from cvpods.layers import ShapeSpec, cat, generalized_batched_nms 10 | from cvpods.modeling.box_regression import Shift2BoxTransform 11 | from cvpods.modeling.losses import iou_loss, sigmoid_focal_loss_jit 12 | from cvpods.modeling.meta_arch.fcos import Scale 13 | from cvpods.modeling.meta_arch.retinanet import ( 14 | permute_to_N_HWA_K, 15 | permute_all_cls_and_box_to_N_HWA_K_and_concat 16 | ) 17 | from cvpods.modeling.postprocessing import detector_postprocess 18 | from cvpods.structures import Boxes, ImageList, Instances 19 | from cvpods.utils import comm, log_first_n 20 | 21 | 22 | class FCOS(nn.Module): 23 | """ 24 | Implement FCOS (https://arxiv.org/abs/1904.01355). 25 | """ 26 | def __init__(self, cfg): 27 | super().__init__() 28 | 29 | self.device = torch.device(cfg.MODEL.DEVICE) 30 | 31 | # fmt: off 32 | self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES 33 | self.in_features = cfg.MODEL.FCOS.IN_FEATURES 34 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 35 | # Loss parameters: 36 | self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA 37 | self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA 38 | self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE 39 | self.center_sampling_radius = cfg.MODEL.FCOS.CENTER_SAMPLING_RADIUS 40 | # Inference parameters: 41 | self.score_threshold = cfg.MODEL.FCOS.SCORE_THRESH_TEST 42 | self.topk_candidates = cfg.MODEL.FCOS.TOPK_CANDIDATES_TEST 43 | self.nms_threshold = cfg.MODEL.FCOS.NMS_THRESH_TEST 44 | self.nms_type = cfg.MODEL.NMS_TYPE 45 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 46 | # fmt: on 47 | 48 | self.backbone = cfg.build_backbone( 49 | cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) 50 | 51 | backbone_shape = self.backbone.output_shape() 52 | feature_shapes = [backbone_shape[f] for f in self.in_features] 53 | self.head = FCOSHead(cfg, feature_shapes) 54 | self.shift_generator = cfg.build_shift_generator(cfg, feature_shapes) 55 | 56 | # Matching and loss 57 | self.shift2box_transform = Shift2BoxTransform( 58 | weights=cfg.MODEL.FCOS.BBOX_REG_WEIGHTS) 59 | self.object_sizes_of_interest = cfg.MODEL.FCOS.OBJECT_SIZES_OF_INTEREST 60 | 61 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( 62 | 3, 1, 1) 63 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( 64 | 3, 1, 1) 65 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 66 | self.to(self.device) 67 | 68 | def forward(self, batched_inputs): 69 | """ 70 | Args: 71 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 72 | Each item in the list contains the inputs for one image. 73 | For now, each item in the list is a dict that contains: 74 | 75 | * image: Tensor, image in (C, H, W) format. 76 | * instances: Instances 77 | 78 | Other information that's included in the original dicts, such as: 79 | 80 | * "height", "width" (int): the output resolution of the model, used in inference. 81 | See :meth:`postprocess` for details. 
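
            At inference time, predictions are produced at the resized/padded input
            resolution and rescaled to this output resolution by
            :func:`detector_postprocess`.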
82 | Returns: 83 | dict[str: Tensor]: 84 | mapping from a named loss to a tensor storing the loss. Used during training only. 85 | """ 86 | images = self.preprocess_image(batched_inputs) 87 | if "instances" in batched_inputs[0]: 88 | gt_instances = [ 89 | x["instances"].to(self.device) for x in batched_inputs 90 | ] 91 | elif "targets" in batched_inputs[0]: 92 | log_first_n( 93 | logging.WARN, 94 | "'targets' in the model inputs is now renamed to 'instances'!", 95 | n=10) 96 | gt_instances = [ 97 | x["targets"].to(self.device) for x in batched_inputs 98 | ] 99 | else: 100 | gt_instances = None 101 | 102 | features = self.backbone(images.tensor) 103 | features = [features[f] for f in self.in_features] 104 | box_cls, box_delta = self.head(features) 105 | shifts = self.shift_generator(features) 106 | 107 | if self.training: 108 | gt_classes, gt_shifts_reg_deltas = self.get_ground_truth( 109 | shifts, gt_instances) 110 | return self.losses(gt_classes, gt_shifts_reg_deltas, box_cls, box_delta) 111 | else: 112 | results = self.inference(box_cls, box_delta, shifts, images) 113 | processed_results = [] 114 | for results_per_image, input_per_image, image_size in zip( 115 | results, batched_inputs, images.image_sizes): 116 | height = input_per_image.get("height", image_size[0]) 117 | width = input_per_image.get("width", image_size[1]) 118 | r = detector_postprocess(results_per_image, height, width) 119 | processed_results.append({"instances": r}) 120 | return processed_results 121 | 122 | def losses(self, gt_classes, gt_shifts_deltas, pred_class_logits, 123 | pred_shift_deltas): 124 | """ 125 | Args: 126 | For `gt_classes` and `gt_shifts_deltas` parameters, see 127 | :meth:`FCOS.get_ground_truth`. 128 | Their shapes are (N, R) and (N, R, 4), respectively, where R is 129 | the total number of shifts across levels, i.e. sum(Hi x Wi) 130 | For `pred_class_logits` and `pred_shift_deltas`, see 131 | :meth:`FCOSHead.forward`. 132 | 133 | Returns: 134 | dict[str: Tensor]: 135 | mapping from a named loss to a scalar tensor 136 | storing the loss. Used during training only. The dict keys are: 137 | "loss_cls" and "loss_box_reg" 138 | """ 139 | pred_class_logits, pred_shift_deltas = \ 140 | permute_all_cls_and_box_to_N_HWA_K_and_concat( 141 | pred_class_logits, pred_shift_deltas, self.num_classes 142 | ) # Shapes: (N x R, K) and (N x R, 4), respectively. 
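        # Compared with the reference FCOS loss, the centerness branch and its BCE loss
        # are absent in this "wo_ctrness" variant: only the focal classification loss and
        # the IoU-style (GIoU here) regression loss on foreground shifts remain, both
        # normalized by the number of foreground shifts averaged across GPUs.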
143 | 144 | gt_classes = gt_classes.flatten() 145 | gt_shifts_deltas = gt_shifts_deltas.view(-1, 4) 146 | 147 | valid_idxs = gt_classes >= 0 148 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 149 | num_foreground = foreground_idxs.sum() 150 | 151 | gt_classes_target = torch.zeros_like(pred_class_logits) 152 | gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1 153 | 154 | num_foreground = comm.all_reduce(num_foreground) / float(comm.get_world_size()) 155 | 156 | # logits loss 157 | loss_cls = sigmoid_focal_loss_jit( 158 | pred_class_logits[valid_idxs], 159 | gt_classes_target[valid_idxs], 160 | alpha=self.focal_loss_alpha, 161 | gamma=self.focal_loss_gamma, 162 | reduction="sum", 163 | ) / max(1.0, num_foreground) 164 | 165 | # regression loss 166 | loss_box_reg = iou_loss( 167 | pred_shift_deltas[foreground_idxs], 168 | gt_shifts_deltas[foreground_idxs], 169 | box_mode="ltrb", 170 | loss_type=self.iou_loss_type, 171 | reduction="sum", 172 | ) / max(1.0, num_foreground) 173 | 174 | return { 175 | "loss_cls": loss_cls, 176 | "loss_box_reg": loss_box_reg, 177 | } 178 | 179 | @torch.no_grad() 180 | def get_ground_truth(self, shifts, targets): 181 | """ 182 | Args: 183 | shifts (list[list[Tensor]]): a list of N=#image elements. Each is a 184 | list of #feature level tensors. The tensors contains shifts of 185 | this image on the specific feature level. 186 | targets (list[Instances]): a list of N `Instances`s. The i-th 187 | `Instances` contains the ground-truth per-instance annotations 188 | for the i-th input image. Specify `targets` during training only. 189 | 190 | Returns: 191 | gt_classes (Tensor): 192 | An integer tensor of shape (N, R) storing ground-truth 193 | labels for each shift. 194 | R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. 195 | Shifts in the valid boxes are assigned their corresponding label in the 196 | [0, K-1] range. Shifts in the background are assigned the label "K". 197 | Shifts in the ignore areas are assigned a label "-1", i.e. ignore. 198 | gt_shifts_deltas (Tensor): 199 | Shape (N, R, 4). 200 | The last dimension represents ground-truth shift2box transform 201 | targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. 202 | The values in the tensor are meaningful only when the corresponding 203 | shift is labeled as foreground. 
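
        Note: with CENTER_SAMPLING_RADIUS > 0 (1.5 in this config), a shift is a
        candidate for a box only if it lies within CENTER_SAMPLING_RADIUS * stride of
        the box center along both axes (clipped to the box); with radius 0 every shift
        inside the box qualifies. Candidates are then filtered by each level's object
        size range, and a shift matched to several boxes is assigned to the one with
        the smallest area.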
204 | """ 205 | gt_classes = [] 206 | gt_shifts_deltas = [] 207 | 208 | for shifts_per_image, targets_per_image in zip(shifts, targets): 209 | object_sizes_of_interest = torch.cat([ 210 | shifts_i.new_tensor(size).unsqueeze(0).expand( 211 | shifts_i.size(0), -1) for shifts_i, size in zip( 212 | shifts_per_image, self.object_sizes_of_interest) 213 | ], dim=0) 214 | 215 | shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) 216 | 217 | gt_boxes = targets_per_image.gt_boxes 218 | 219 | deltas = self.shift2box_transform.get_deltas( 220 | shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1)) 221 | 222 | if self.center_sampling_radius > 0: 223 | centers = gt_boxes.get_centers() 224 | is_in_boxes = [] 225 | for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): 226 | radius = stride * self.center_sampling_radius 227 | center_boxes = torch.cat(( 228 | torch.max(centers - radius, gt_boxes.tensor[:, :2]), 229 | torch.min(centers + radius, gt_boxes.tensor[:, 2:]), 230 | ), dim=-1) 231 | center_deltas = self.shift2box_transform.get_deltas( 232 | shifts_i, center_boxes.unsqueeze(1)) 233 | is_in_boxes.append(center_deltas.min(dim=-1).values > 0) 234 | is_in_boxes = torch.cat(is_in_boxes, dim=1) 235 | else: 236 | # no center sampling, it will use all the locations within a ground-truth box 237 | is_in_boxes = deltas.min(dim=-1).values > 0 238 | 239 | max_deltas = deltas.max(dim=-1).values 240 | # limit the regression range for each location 241 | is_cared_in_the_level = \ 242 | (max_deltas >= object_sizes_of_interest[None, :, 0]) & \ 243 | (max_deltas <= object_sizes_of_interest[None, :, 1]) 244 | 245 | gt_positions_area = gt_boxes.area().unsqueeze(1).repeat( 246 | 1, shifts_over_all_feature_maps.size(0)) 247 | gt_positions_area[~is_in_boxes] = math.inf 248 | gt_positions_area[~is_cared_in_the_level] = math.inf 249 | 250 | # if there are still more than one objects for a position, 251 | # we choose the one with minimal area 252 | positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0) 253 | 254 | # ground truth box regression 255 | gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas( 256 | shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor) 257 | 258 | # ground truth classes 259 | has_gt = len(targets_per_image) > 0 260 | if has_gt: 261 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 262 | # Shifts with area inf are treated as background. 263 | gt_classes_i[positions_min_area == math.inf] = self.num_classes 264 | else: 265 | gt_classes_i = torch.zeros_like( 266 | gt_matched_idxs) + self.num_classes 267 | 268 | gt_classes.append(gt_classes_i) 269 | gt_shifts_deltas.append(gt_shifts_reg_deltas_i) 270 | 271 | return torch.stack(gt_classes), torch.stack(gt_shifts_deltas) 272 | 273 | def inference(self, box_cls, box_delta, shifts, images): 274 | """ 275 | Arguments: 276 | box_cls, box_delta: Same as the output of :meth:`FCOSHead.forward` 277 | shifts (list[list[Tensor]): a list of #images elements. Each is a 278 | list of #feature level tensor. The tensor contain shifts of this 279 | image on the specific feature level. 280 | images (ImageList): the input images 281 | 282 | Returns: 283 | results (List[Instances]): a list of #images elements. 
284 | """ 285 | assert len(shifts) == len(images) 286 | results = [] 287 | 288 | box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] 289 | box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] 290 | # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4) 291 | 292 | for img_idx, shifts_per_image in enumerate(shifts): 293 | image_size = images.image_sizes[img_idx] 294 | box_cls_per_image = [ 295 | box_cls_per_level[img_idx] for box_cls_per_level in box_cls 296 | ] 297 | box_reg_per_image = [ 298 | box_reg_per_level[img_idx] for box_reg_per_level in box_delta 299 | ] 300 | results_per_image = self.inference_single_image( 301 | box_cls_per_image, box_reg_per_image, shifts_per_image, 302 | tuple(image_size)) 303 | results.append(results_per_image) 304 | return results 305 | 306 | def inference_single_image(self, box_cls, box_delta, shifts, image_size): 307 | """ 308 | Single-image inference. Return bounding-box detection results by thresholding 309 | on scores and applying non-maximum suppression (NMS). 310 | 311 | Arguments: 312 | box_cls (list[Tensor]): list of #feature levels. Each entry contains 313 | tensor of size (H x W, K) 314 | box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. 315 | shifts (list[Tensor]): list of #feature levels. Each entry contains 316 | a tensor, which contains all the shifts for that 317 | image in that feature level. 318 | image_size (tuple(H, W)): a tuple of the image height and width. 319 | 320 | Returns: 321 | Same as `inference`, but for only one image. 322 | """ 323 | boxes_all = [] 324 | scores_all = [] 325 | class_idxs_all = [] 326 | 327 | # Iterate over every feature level 328 | for box_cls_i, box_reg_i, shifts_i in zip(box_cls, box_delta, shifts): 329 | # (HxWxK,) 330 | box_cls_i = box_cls_i.sigmoid_().flatten() 331 | 332 | # Keep top k top scoring indices only. 333 | num_topk = min(self.topk_candidates, box_reg_i.size(0)) 334 | # torch.sort is actually faster than .topk (at least on GPUs) 335 | predicted_prob, topk_idxs = box_cls_i.sort(descending=True) 336 | predicted_prob = predicted_prob[:num_topk] 337 | topk_idxs = topk_idxs[:num_topk] 338 | 339 | # filter out the proposals with low confidence score 340 | keep_idxs = predicted_prob > self.score_threshold 341 | predicted_prob = predicted_prob[keep_idxs] 342 | topk_idxs = topk_idxs[keep_idxs] 343 | 344 | shift_idxs = topk_idxs // self.num_classes 345 | classes_idxs = topk_idxs % self.num_classes 346 | 347 | box_reg_i = box_reg_i[shift_idxs] 348 | shifts_i = shifts_i[shift_idxs] 349 | # predict boxes 350 | predicted_boxes = self.shift2box_transform.apply_deltas( 351 | box_reg_i, shifts_i) 352 | 353 | boxes_all.append(predicted_boxes) 354 | scores_all.append(predicted_prob) 355 | class_idxs_all.append(classes_idxs) 356 | 357 | boxes_all, scores_all, class_idxs_all = [ 358 | cat(x) for x in [boxes_all, scores_all, class_idxs_all] 359 | ] 360 | 361 | keep = generalized_batched_nms( 362 | boxes_all, scores_all, class_idxs_all, 363 | self.nms_threshold, nms_type=self.nms_type 364 | ) 365 | keep = keep[:self.max_detections_per_image] 366 | 367 | result = Instances(image_size) 368 | result.pred_boxes = Boxes(boxes_all[keep]) 369 | result.scores = scores_all[keep] 370 | result.pred_classes = class_idxs_all[keep] 371 | return result 372 | 373 | def preprocess_image(self, batched_inputs): 374 | """ 375 | Normalize, pad and batch the input images. 
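
        Images are normalized with PIXEL_MEAN / PIXEL_STD and padded so that the
        batched tensor is divisible by the backbone's size_divisibility.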
376 | """ 377 | images = [x["image"].to(self.device) for x in batched_inputs] 378 | images = [self.normalizer(x) for x in images] 379 | images = ImageList.from_tensors(images, 380 | self.backbone.size_divisibility) 381 | return images 382 | 383 | def _inference_for_ms_test(self, batched_inputs): 384 | """ 385 | function used for multiscale test, will be refactor in the future. 386 | The same input with `forward` function. 387 | """ 388 | assert not self.training, "inference mode with training=True" 389 | assert len(batched_inputs) == 1, "inference image number > 1" 390 | images = self.preprocess_image(batched_inputs) 391 | 392 | features = self.backbone(images.tensor) 393 | features = [features[f] for f in self.in_features] 394 | box_cls, box_delta = self.head(features) 395 | shifts = self.shift_generator(features) 396 | 397 | results = self.inference(box_cls, box_delta, shifts, images) 398 | for results_per_image, input_per_image, image_size in zip( 399 | results, batched_inputs, images.image_sizes 400 | ): 401 | height = input_per_image.get("height", image_size[0]) 402 | width = input_per_image.get("width", image_size[1]) 403 | processed_results = detector_postprocess(results_per_image, height, width) 404 | return processed_results 405 | 406 | 407 | class FCOSHead(nn.Module): 408 | """ 409 | The head used in FCOS for object classification and box regression. 410 | It has two subnets for the two tasks, with a common structure but separate parameters. 411 | """ 412 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 413 | super().__init__() 414 | # fmt: off 415 | in_channels = input_shape[0].channels 416 | num_classes = cfg.MODEL.FCOS.NUM_CLASSES 417 | num_convs = cfg.MODEL.FCOS.NUM_CONVS 418 | prior_prob = cfg.MODEL.FCOS.PRIOR_PROB 419 | num_shifts = cfg.build_shift_generator(cfg, input_shape).num_cell_shifts 420 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 421 | self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS 422 | # fmt: on 423 | assert len(set(num_shifts)) == 1, "using differenct num_shifts value is not supported" 424 | num_shifts = num_shifts[0] 425 | 426 | cls_subnet = [] 427 | bbox_subnet = [] 428 | for _ in range(num_convs): 429 | cls_subnet.append( 430 | nn.Conv2d(in_channels, 431 | in_channels, 432 | kernel_size=3, 433 | stride=1, 434 | padding=1)) 435 | cls_subnet.append(nn.GroupNorm(32, in_channels)) 436 | cls_subnet.append(nn.ReLU()) 437 | bbox_subnet.append( 438 | nn.Conv2d(in_channels, 439 | in_channels, 440 | kernel_size=3, 441 | stride=1, 442 | padding=1)) 443 | bbox_subnet.append(nn.GroupNorm(32, in_channels)) 444 | bbox_subnet.append(nn.ReLU()) 445 | 446 | self.cls_subnet = nn.Sequential(*cls_subnet) 447 | self.bbox_subnet = nn.Sequential(*bbox_subnet) 448 | self.cls_score = nn.Conv2d(in_channels, 449 | num_shifts * num_classes, 450 | kernel_size=3, 451 | stride=1, 452 | padding=1) 453 | self.bbox_pred = nn.Conv2d(in_channels, 454 | num_shifts * 4, 455 | kernel_size=3, 456 | stride=1, 457 | padding=1) 458 | 459 | # Initialization 460 | for modules in [ 461 | self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred 462 | ]: 463 | for layer in modules.modules(): 464 | if isinstance(layer, nn.Conv2d): 465 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 466 | torch.nn.init.constant_(layer.bias, 0) 467 | if isinstance(layer, nn.GroupNorm): 468 | torch.nn.init.constant_(layer.weight, 1) 469 | torch.nn.init.constant_(layer.bias, 0) 470 | 471 | # Use prior in model initialization to improve stability 472 | bias_value = -math.log((1 - prior_prob) 
/ prior_prob) 473 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 474 | 475 | self.scales = nn.ModuleList( 476 | [Scale(init_value=1.0) for _ in range(len(self.fpn_strides))]) 477 | 478 | def forward(self, features): 479 | """ 480 | Arguments: 481 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 482 | Each tensor in the list correspond to different feature levels. 483 | 484 | Returns: 485 | logits (list[Tensor]): #lvl tensors, each has shape (N, K, Hi, Wi). 486 | The tensor predicts the classification probability 487 | at each spatial position for each of the K object classes. 488 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, 4, Hi, Wi). 489 | The tensor predicts 4-vector (dl,dt,dr,db) box 490 | regression values for every shift. These values are the 491 | relative offset between the shift and the ground truth box. 492 | """ 493 | logits = [] 494 | bbox_reg = [] 495 | for level, feature in enumerate(features): 496 | cls_subnet = self.cls_subnet(feature) 497 | bbox_subnet = self.bbox_subnet(feature) 498 | 499 | logits.append(self.cls_score(cls_subnet)) 500 | 501 | bbox_pred = self.scales[level](self.bbox_pred(bbox_subnet)) 502 | if self.norm_reg_targets: 503 | bbox_reg.append(F.relu(bbox_pred) * self.fpn_strides[level]) 504 | else: 505 | bbox_reg.append(torch.exp(bbox_pred)) 506 | return logits, bbox_reg 507 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms.wo_ctrness/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
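
    Note: in this playground entry the backbone choice is fixed; the function below
    always calls build_retinanet_resnet_fpn_p5_backbone, so the RetinaNet-style
    ResNet-FPN (p5) backbone is constructed regardless of cfg.MODEL.BACKBONE.NAME.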
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms/README.md: -------------------------------------------------------------------------------- 1 | # fcos.res50.fpn.coco.800size.3x_ms 2 | 3 | seed: 9476764 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.414 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.601 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.449 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.256 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.449 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.531 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.335 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.553 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.591 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.400 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.635 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.735 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 41.393 | 60.086 | 44.923 | 25.561 | 44.897 | 53.084 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 55.999 | bicycle | 32.520 | car | 45.318 | 30 | | motorcycle | 43.277 | airplane | 67.218 | bus | 66.594 | 31 | | train | 63.735 | truck | 37.657 | boat | 24.362 | 32 | | traffic light | 27.385 | fire hydrant | 67.430 | stop sign | 63.445 | 33 | | parking meter | 43.762 | bench | 22.987 | bird | 36.695 | 34 | | cat | 67.516 | dog | 62.411 | horse | 56.741 | 35 | | sheep | 53.373 | cow | 58.669 | elephant | 64.608 | 36 | | bear | 71.341 | zebra | 69.199 | giraffe | 68.521 | 37 | | backpack | 16.543 | umbrella | 38.757 | handbag | 15.861 | 38 | | tie | 32.415 | suitcase | 39.008 | frisbee | 68.187 | 39 | | skis | 20.592 | snowboard | 32.193 | sports ball | 47.290 | 40 | | kite | 42.626 | baseball bat | 28.741 | baseball glove | 36.490 | 41 | | skateboard | 54.258 | surfboard | 33.234 | tennis racket | 49.328 | 42 | | bottle | 39.079 | wine glass | 37.518 | cup | 42.291 | 43 | | fork | 31.993 | knife | 18.649 | spoon | 15.694 | 44 | | bowl | 41.004 | banana | 24.253 | apple | 19.303 | 45 | | sandwich | 31.717 | orange | 31.743 | broccoli | 23.667 | 46 | | carrot | 21.484 | hot dog | 31.344 | pizza | 52.775 | 47 | | donut | 46.693 | cake | 37.320 | chair | 28.833 | 48 | | couch | 
44.514 | potted plant | 28.510 | bed | 38.643 | 49 | | dining table | 26.747 | toilet | 59.289 | tv | 55.466 | 50 | | laptop | 57.641 | mouse | 62.759 | remote | 31.570 | 51 | | keyboard | 47.522 | cell phone | 35.813 | microwave | 52.229 | 52 | | oven | 32.445 | toaster | 41.552 | sink | 36.470 | 53 | | refrigerator | 53.942 | book | 13.845 | clock | 48.035 | 54 | | vase | 36.108 | scissors | 26.815 | teddy bear | 47.294 | 55 | | hair drier | 13.241 | toothbrush | 19.316 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | CENTERNESS_ON_REG=True, 15 | NORM_REG_TARGETS=True, 16 | NMS_THRESH_TEST=0.6, 17 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 18 | FOCAL_LOSS_GAMMA=2.0, 19 | FOCAL_LOSS_ALPHA=0.25, 20 | IOU_LOSS_TYPE="giou", 21 | CENTER_SAMPLING_RADIUS=1.5, 22 | OBJECT_SIZES_OF_INTEREST=[ 23 | [-1, 64], 24 | [64, 128], 25 | [128, 256], 26 | [256, 512], 27 | [512, float("inf")], 28 | ], 29 | ), 30 | ), 31 | DATASETS=dict( 32 | TRAIN=("coco_2017_train",), 33 | TEST=("coco_2017_val",), 34 | ), 35 | SOLVER=dict( 36 | CHECKPOINT_PERIOD=10000, 37 | LR_SCHEDULER=dict( 38 | MAX_ITER=270000, 39 | STEPS=(210000, 250000), 40 | ), 41 | OPTIMIZER=dict( 42 | BASE_LR=0.01, 43 | ), 44 | IMS_PER_BATCH=16, 45 | ), 46 | INPUT=dict( 47 | AUG=dict( 48 | TRAIN_PIPELINES=[ 49 | ("ResizeShortestEdge", 50 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 51 | ("RandomFlip", dict()), 52 | ], 53 | TEST_PIPELINES=[ 54 | ("ResizeShortestEdge", 55 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 56 | ], 57 | ) 58 | ), 59 | TEST=dict( 60 | EVAL_PEROID=10000, 61 | ), 62 | OUTPUT_DIR=osp.join( 63 | '/data/Outputs/model_logs/cvpods_playground', 64 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 65 | ) 66 | 67 | 68 | class CustomFCOSConfig(FCOSConfig): 69 | def __init__(self): 70 | super(CustomFCOSConfig, self).__init__() 71 | self._register_configuration(_config_dict) 72 | 73 | 74 | config = CustomFCOSConfig() 75 | -------------------------------------------------------------------------------- /playground/detection/coco/fcos.res50.fpn.coco.800size.3x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | from cvpods.modeling.meta_arch.fcos import FCOS 8 | 9 | 10 | def build_backbone(cfg, input_shape=None): 11 | """ 12 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
13 | 14 | Returns: 15 | an instance of :class:`Backbone` 16 | """ 17 | if input_shape is None: 18 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 19 | 20 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 21 | assert isinstance(backbone, Backbone) 22 | return backbone 23 | 24 | 25 | def build_shift_generator(cfg, input_shape): 26 | 27 | return ShiftGenerator(cfg, input_shape) 28 | 29 | 30 | def build_model(cfg): 31 | 32 | cfg.build_backbone = build_backbone 33 | cfg.build_shift_generator = build_shift_generator 34 | 35 | model = FCOS(cfg) 36 | logger = logging.getLogger(__name__) 37 | logger.info("Model:\n{}".format(model)) 38 | return model 39 | -------------------------------------------------------------------------------- /playground/detection/coco/loss.res50.fpn.coco.800size.3x_ms/README.md: -------------------------------------------------------------------------------- 1 | # loss.res50.fpn.coco.800size.3x_ms 2 | 3 | seed: 3751988 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.387 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.549 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.427 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.238 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.424 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.489 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.327 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.565 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.622 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.419 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.656 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.788 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 38.708 | 54.872 | 42.708 | 23.793 | 42.364 | 48.888 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 55.156 | bicycle | 28.500 | car | 44.210 | 30 | | motorcycle | 40.619 | airplane | 64.635 | bus | 62.010 | 31 | | train | 60.690 | truck | 31.329 | boat | 24.446 | 32 | | traffic light | 28.411 | fire hydrant | 63.136 | stop sign | 63.268 | 33 | | parking meter | 39.984 | bench | 22.074 | bird | 34.874 | 34 | | cat | 61.970 | dog | 57.768 | horse | 55.772 | 35 | | sheep | 51.466 | cow | 57.623 | elephant | 62.707 | 36 | | bear | 66.627 | zebra | 67.822 | giraffe | 67.217 | 37 | | backpack | 13.496 | umbrella | 37.257 | handbag | 13.215 | 38 | | tie | 30.037 | suitcase | 35.837 | frisbee | 63.655 | 39 | | skis | 20.689 | snowboard | 26.305 | sports ball | 48.801 | 40 | | kite | 42.445 | baseball bat | 22.402 | baseball glove | 33.640 | 41 | | skateboard | 48.489 | surfboard | 30.267 | tennis racket | 45.932 | 42 | | bottle | 37.132 | wine glass | 34.082 | cup | 39.278 | 43 | | fork | 26.000 | knife | 14.181 | spoon | 14.024 | 44 | | bowl | 37.208 | banana | 23.155 | apple | 18.371 | 45 | | sandwich | 31.738 | orange | 30.707 | broccoli | 23.113 | 46 | | carrot | 20.558 | hot dog | 31.242 | pizza | 46.054 | 47 | | donut | 45.652 | cake | 34.416 | chair | 25.191 | 48 | | couch | 
39.924 | potted plant | 24.988 | bed | 36.558 | 49 | | dining table | 26.308 | toilet | 56.805 | tv | 53.605 | 50 | | laptop | 51.866 | mouse | 58.877 | remote | 25.243 | 51 | | keyboard | 48.115 | cell phone | 30.290 | microwave | 55.947 | 52 | | oven | 31.628 | toaster | 28.743 | sink | 33.973 | 53 | | refrigerator | 48.122 | book | 12.652 | clock | 47.065 | 54 | | vase | 35.688 | scissors | 25.522 | teddy bear | 42.451 | 55 | | hair drier | 8.780 | toothbrush | 16.637 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/loss.res50.fpn.coco.800size.3x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | CENTER_SAMPLING_RADIUS=1.5, 24 | ), 25 | NMS_TYPE=None, 26 | ), 27 | DATASETS=dict( 28 | TRAIN=("coco_2017_train",), 29 | TEST=("coco_2017_val",), 30 | ), 31 | SOLVER=dict( 32 | CHECKPOINT_PERIOD=10000, 33 | LR_SCHEDULER=dict( 34 | MAX_ITER=270000, 35 | STEPS=(210000, 250000), 36 | ), 37 | OPTIMIZER=dict( 38 | BASE_LR=0.01, 39 | ), 40 | IMS_PER_BATCH=16, 41 | ), 42 | INPUT=dict( 43 | AUG=dict( 44 | TRAIN_PIPELINES=[ 45 | ("ResizeShortestEdge", 46 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 47 | ("RandomFlip", dict()), 48 | ], 49 | TEST_PIPELINES=[ 50 | ("ResizeShortestEdge", 51 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 52 | ], 53 | ) 54 | ), 55 | TEST=dict( 56 | EVAL_PEROID=10000, 57 | ), 58 | OUTPUT_DIR=osp.join( 59 | '/data/Outputs/model_logs/cvpods_playground', 60 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 61 | ) 62 | 63 | 64 | class CustomFCOSConfig(FCOSConfig): 65 | def __init__(self): 66 | super(CustomFCOSConfig, self).__init__() 67 | self._register_configuration(_config_dict) 68 | 69 | 70 | config = CustomFCOSConfig() 71 | -------------------------------------------------------------------------------- /playground/detection/coco/loss.res50.fpn.coco.800size.3x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf.aux/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms.3dmf.aux 2 | 3 | seed: 9905538 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.414 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.595 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.456 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.261 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.449 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.520 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.331 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.565 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.615 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.423 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.648 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.759 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 41.443 | 59.520 | 45.650 | 26.075 | 44.914 | 52.025 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.121 | bicycle | 31.877 | car | 46.088 | 30 | | motorcycle | 44.040 | airplane | 64.505 | bus | 67.009 | 31 | | train | 64.725 | truck | 36.420 | boat | 26.455 | 32 | | traffic light | 28.310 | fire hydrant | 66.097 | stop sign | 66.077 | 33 | | parking meter | 41.362 | bench | 23.638 | bird | 36.667 | 34 | | cat | 65.117 | dog | 60.452 | horse | 56.984 | 35 | | sheep | 54.113 | cow | 59.500 | elephant | 66.082 | 36 | | bear | 72.626 | zebra | 69.089 | giraffe | 68.849 | 37 | | backpack | 16.112 | umbrella | 39.704 | handbag | 16.575 | 38 | | tie | 32.851 | suitcase | 39.014 | frisbee | 66.197 | 39 | | skis | 23.084 | snowboard | 32.775 | sports ball | 49.004 | 40 | | kite | 43.987 | baseball bat | 25.279 | baseball glove | 37.227 | 41 | | skateboard | 53.510 | surfboard | 33.053 | tennis racket | 47.605 | 42 | | bottle | 38.319 | wine glass | 36.563 | cup | 43.496 | 43 | | fork | 32.541 | knife | 19.178 | spoon | 15.184 | 44 | | bowl | 41.310 | banana | 25.229 | apple | 19.220 | 45 | | sandwich | 34.396 | orange | 30.466 | broccoli | 22.790 | 46 | | carrot | 22.295 | hot dog | 33.964 | pizza | 50.737 | 47 | | donut | 48.532 | cake | 36.915 | chair | 28.639 
| 48 | | couch | 42.555 | potted plant | 27.923 | bed | 41.984 | 49 | | dining table | 28.917 | toilet | 61.171 | tv | 55.547 | 50 | | laptop | 57.712 | mouse | 62.472 | remote | 31.074 | 51 | | keyboard | 46.964 | cell phone | 35.703 | microwave | 56.487 | 52 | | oven | 36.069 | toaster | 30.171 | sink | 35.591 | 53 | | refrigerator | 52.934 | book | 14.796 | clock | 52.402 | 54 | | vase | 40.550 | scissors | 23.294 | teddy bear | 45.177 | 55 | | hair drier | 8.282 | toothbrush | 19.696 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf.aux/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | AUX_TOPK=9, 26 | FILTER_KERNEL_SIZE=3, 27 | FILTER_TAU=2, 28 | ), 29 | ), 30 | DATASETS=dict( 31 | TRAIN=("coco_2017_train",), 32 | TEST=("coco_2017_val",), 33 | ), 34 | SOLVER=dict( 35 | CHECKPOINT_PERIOD=10000, 36 | LR_SCHEDULER=dict( 37 | MAX_ITER=270000, 38 | STEPS=(210000, 250000), 39 | ), 40 | OPTIMIZER=dict( 41 | BASE_LR=0.01, 42 | ), 43 | IMS_PER_BATCH=16, 44 | ), 45 | INPUT=dict( 46 | AUG=dict( 47 | TRAIN_PIPELINES=[ 48 | ("ResizeShortestEdge", 49 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", 54 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | EVAL_PEROID=10000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf.aux/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms.3dmf 2 | 3 | seed: 47909290 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.406 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.580 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.447 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.261 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.442 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.508 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.330 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.565 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.616 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.432 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.650 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.772 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 40.575 | 57.958 | 44.748 | 26.136 | 44.203 | 50.830 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.011 | bicycle | 30.611 | car | 45.575 | 30 | | motorcycle | 44.060 | airplane | 64.475 | bus | 66.127 | 31 | | train | 61.872 | truck | 34.914 | boat | 26.082 | 32 | | traffic light | 27.353 | fire hydrant | 67.141 | stop sign | 62.605 | 33 | | parking meter | 37.636 | bench | 21.800 | bird | 36.636 | 34 | | cat | 63.188 | dog | 58.236 | horse | 56.319 | 35 | | sheep | 52.617 | cow | 59.001 | elephant | 63.909 | 36 | | bear | 71.724 | zebra | 69.096 | giraffe | 69.357 | 37 | | backpack | 15.925 | umbrella | 38.355 | handbag | 15.242 | 38 | | tie | 31.785 | suitcase | 38.321 | frisbee | 65.553 | 39 | | skis | 22.458 | snowboard | 29.646 | sports ball | 49.683 | 40 | | kite | 45.033 | baseball bat | 26.191 | baseball glove | 35.489 | 41 | | skateboard | 51.098 | surfboard | 33.133 | tennis racket | 47.758 | 42 | | bottle | 38.607 | wine glass | 35.685 | cup | 41.064 | 43 | | fork | 31.083 | knife | 16.317 | spoon | 16.181 | 44 | | bowl | 39.595 | banana | 23.849 | apple | 18.303 | 45 | | sandwich | 35.337 | orange | 31.568 | broccoli | 21.855 | 46 | | carrot | 22.811 | hot dog | 32.806 | pizza | 47.969 | 47 | | donut | 46.088 | cake | 36.073 | chair | 27.704 | 48 | 
| couch | 41.137 | potted plant | 29.213 | bed | 41.475 | 49 | | dining table | 28.768 | toilet | 58.593 | tv | 56.197 | 50 | | laptop | 55.163 | mouse | 62.155 | remote | 28.710 | 51 | | keyboard | 47.542 | cell phone | 33.312 | microwave | 57.314 | 52 | | oven | 36.153 | toaster | 30.023 | sink | 34.236 | 53 | | refrigerator | 52.814 | book | 14.329 | clock | 50.840 | 54 | | vase | 38.593 | scissors | 20.232 | teddy bear | 42.617 | 55 | | hair drier | 9.686 | toothbrush | 22.022 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | FILTER_KERNEL_SIZE=3, 26 | FILTER_TAU=2, 27 | ), 28 | NMS_TYPE=None, 29 | ), 30 | DATASETS=dict( 31 | TRAIN=("coco_2017_train",), 32 | TEST=("coco_2017_val",), 33 | ), 34 | SOLVER=dict( 35 | CHECKPOINT_PERIOD=10000, 36 | LR_SCHEDULER=dict( 37 | MAX_ITER=270000, 38 | STEPS=(210000, 250000), 39 | ), 40 | OPTIMIZER=dict( 41 | BASE_LR=0.01, 42 | ), 43 | IMS_PER_BATCH=16, 44 | ), 45 | INPUT=dict( 46 | AUG=dict( 47 | TRAIN_PIPELINES=[ 48 | ("ResizeShortestEdge", 49 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", 54 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | EVAL_PEROID=10000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux 2 | 3 | seed: 48196309 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.415 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.596 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.455 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.264 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.447 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.528 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.334 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.566 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.615 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.426 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.645 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.780 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 41.537 | 59.599 | 45.482 | 26.398 | 44.711 | 52.790 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.022 | bicycle | 31.748 | car | 45.843 | 30 | | motorcycle | 43.949 | airplane | 64.532 | bus | 65.273 | 31 | | train | 64.257 | truck | 34.849 | boat | 26.335 | 32 | | traffic light | 28.817 | fire hydrant | 68.268 | stop sign | 64.870 | 33 | | parking meter | 45.673 | bench | 23.891 | bird | 36.187 | 34 | | cat | 65.462 | dog | 61.240 | horse | 57.789 | 35 | | sheep | 54.343 | cow | 59.442 | elephant | 64.308 | 36 | | bear | 71.876 | zebra | 68.594 | giraffe | 69.073 | 37 | | backpack | 15.893 | umbrella | 40.820 | handbag | 15.851 | 38 | | tie | 34.247 | suitcase | 39.213 | frisbee | 67.964 | 39 | | skis | 22.769 | snowboard | 32.849 | sports ball | 50.388 | 40 | | kite | 43.838 | baseball bat | 26.389 | baseball glove | 36.528 | 41 | | skateboard | 50.424 | surfboard | 33.322 | tennis racket | 47.540 | 42 | | bottle | 39.076 | wine glass | 36.604 | cup | 42.789 | 43 | | fork | 32.285 | knife | 19.467 | spoon | 16.573 | 44 | | bowl | 40.635 | banana | 25.418 | apple | 18.561 | 45 | | sandwich | 33.519 | orange | 32.752 | broccoli | 24.165 | 46 | | carrot | 21.626 | hot dog | 33.744 | pizza | 49.567 | 47 | | donut | 47.892 | cake | 37.923 | 
chair | 27.820 | 48 | | couch | 43.414 | potted plant | 27.061 | bed | 42.497 | 49 | | dining table | 29.552 | toilet | 60.603 | tv | 55.802 | 50 | | laptop | 57.497 | mouse | 62.402 | remote | 30.887 | 51 | | keyboard | 47.057 | cell phone | 34.426 | microwave | 59.429 | 52 | | oven | 34.282 | toaster | 26.914 | sink | 37.059 | 53 | | refrigerator | 56.400 | book | 14.374 | clock | 51.286 | 54 | | vase | 39.864 | scissors | 24.366 | teddy bear | 44.885 | 55 | | hair drier | 6.745 | toothbrush | 25.060 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | AUX_TOPK=9, 26 | FILTER_KERNEL_SIZE=3, 27 | FILTER_TAU=2, 28 | ), 29 | ), 30 | DATASETS=dict( 31 | TRAIN=("coco_2017_train",), 32 | TEST=("coco_2017_val",), 33 | ), 34 | SOLVER=dict( 35 | CHECKPOINT_PERIOD=10000, 36 | LR_SCHEDULER=dict( 37 | MAX_ITER=270000, 38 | STEPS=(210000, 250000), 39 | ), 40 | OPTIMIZER=dict( 41 | BASE_LR=0.01, 42 | ), 43 | IMS_PER_BATCH=16, 44 | ), 45 | INPUT=dict( 46 | AUG=dict( 47 | TRAIN_PIPELINES=[ 48 | ("ResizeShortestEdge", 49 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", 54 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | EVAL_PEROID=10000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn.aux/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn 2 | 3 | seed: 20416029 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.409 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.584 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.451 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.442 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.509 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.333 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.568 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.620 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.427 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.651 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.781 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 40.899 | 58.400 | 45.096 | 25.013 | 44.239 | 50.856 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.046 | bicycle | 30.989 | car | 45.579 | 30 | | motorcycle | 43.111 | airplane | 65.106 | bus | 65.405 | 31 | | train | 62.920 | truck | 34.734 | boat | 25.357 | 32 | | traffic light | 28.115 | fire hydrant | 66.877 | stop sign | 64.047 | 33 | | parking meter | 42.062 | bench | 22.379 | bird | 36.070 | 34 | | cat | 64.435 | dog | 59.563 | horse | 56.937 | 35 | | sheep | 53.341 | cow | 59.032 | elephant | 66.209 | 36 | | bear | 74.630 | zebra | 70.397 | giraffe | 68.291 | 37 | | backpack | 14.514 | umbrella | 40.210 | handbag | 16.376 | 38 | | tie | 33.069 | suitcase | 39.480 | frisbee | 65.160 | 39 | | skis | 24.172 | snowboard | 29.357 | sports ball | 49.727 | 40 | | kite | 44.820 | baseball bat | 28.152 | baseball glove | 35.731 | 41 | | skateboard | 52.100 | surfboard | 32.695 | tennis racket | 46.665 | 42 | | bottle | 37.841 | wine glass | 35.470 | cup | 40.993 | 43 | | fork | 30.267 | knife | 18.351 | spoon | 14.851 | 44 | | bowl | 39.647 | banana | 25.875 | apple | 18.507 | 45 | | sandwich | 32.898 | orange | 31.784 | broccoli | 22.977 | 46 | | carrot | 22.787 | hot dog | 34.321 | pizza | 49.750 | 47 | | donut | 46.930 | cake | 36.820 | chair | 
27.396 | 48 | | couch | 42.319 | potted plant | 26.744 | bed | 40.382 | 49 | | dining table | 27.526 | toilet | 61.973 | tv | 55.405 | 50 | | laptop | 55.293 | mouse | 62.157 | remote | 30.304 | 51 | | keyboard | 47.354 | cell phone | 32.881 | microwave | 52.690 | 52 | | oven | 35.054 | toaster | 35.407 | sink | 34.457 | 53 | | refrigerator | 52.187 | book | 14.121 | clock | 50.530 | 54 | | vase | 39.453 | scissors | 20.537 | teddy bear | 43.979 | 55 | | hair drier | 7.872 | toothbrush | 21.980 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | FILTER_KERNEL_SIZE=3, 26 | FILTER_TAU=2, 27 | ), 28 | NMS_TYPE=None, 29 | ), 30 | DATASETS=dict( 31 | TRAIN=("coco_2017_train",), 32 | TEST=("coco_2017_val",), 33 | ), 34 | SOLVER=dict( 35 | CHECKPOINT_PERIOD=10000, 36 | LR_SCHEDULER=dict( 37 | MAX_ITER=270000, 38 | STEPS=(210000, 250000), 39 | ), 40 | OPTIMIZER=dict( 41 | BASE_LR=0.01, 42 | ), 43 | IMS_PER_BATCH=16, 44 | ), 45 | INPUT=dict( 46 | AUG=dict( 47 | TRAIN_PIPELINES=[ 48 | ("ResizeShortestEdge", 49 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", 54 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | EVAL_PEROID=10000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.3dmf_wo_gn/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.argmax/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms.argmax 2 | 3 | seed: 28371048 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.392 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.565 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.429 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.251 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.425 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.486 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.332 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.561 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.611 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.425 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.642 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.771 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 39.173 | 56.541 | 42.864 | 25.063 | 42.544 | 48.556 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 55.369 | bicycle | 29.024 | car | 44.949 | 30 | | motorcycle | 40.755 | airplane | 61.428 | bus | 63.706 | 31 | | train | 63.010 | truck | 33.339 | boat | 24.766 | 32 | | traffic light | 27.866 | fire hydrant | 65.189 | stop sign | 63.710 | 33 | | parking meter | 38.060 | bench | 21.041 | bird | 34.159 | 34 | | cat | 62.558 | dog | 58.592 | horse | 53.840 | 35 | | sheep | 51.898 | cow | 57.720 | elephant | 63.070 | 36 | | bear | 65.021 | zebra | 68.255 | giraffe | 66.300 | 37 | | backpack | 15.401 | umbrella | 36.794 | handbag | 13.792 | 38 | | tie | 32.077 | suitcase | 36.793 | frisbee | 63.497 | 39 | | skis | 20.846 | snowboard | 29.080 | sports ball | 49.068 | 40 | | kite | 45.513 | baseball bat | 23.079 | baseball glove | 35.174 | 41 | | skateboard | 48.845 | surfboard | 31.265 | tennis racket | 44.957 | 42 | | bottle | 37.024 | wine glass | 36.082 | cup | 39.056 | 43 | | fork | 28.172 | knife | 15.247 | spoon | 11.683 | 44 | | bowl | 38.294 | banana | 22.372 | apple | 17.241 | 45 | | sandwich | 32.986 | orange | 29.593 | broccoli | 22.183 | 46 | | carrot | 20.558 | hot dog | 30.988 | pizza | 46.334 | 47 | | donut | 45.065 | cake | 33.891 | chair | 25.763 | 
48 | | couch | 38.314 | potted plant | 24.476 | bed | 38.726 | 49 | | dining table | 26.741 | toilet | 59.266 | tv | 54.473 | 50 | | laptop | 52.434 | mouse | 60.845 | remote | 27.208 | 51 | | keyboard | 47.558 | cell phone | 31.518 | microwave | 54.648 | 52 | | oven | 33.369 | toaster | 37.296 | sink | 35.792 | 53 | | refrigerator | 50.387 | book | 13.957 | clock | 50.504 | 54 | | vase | 36.301 | scissors | 22.354 | teddy bear | 40.904 | 55 | | hair drier | 3.695 | toothbrush | 20.750 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.argmax/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | ), 26 | NMS_TYPE=None, 27 | ), 28 | DATASETS=dict( 29 | TRAIN=("coco_2017_train",), 30 | TEST=("coco_2017_val",), 31 | ), 32 | SOLVER=dict( 33 | CHECKPOINT_PERIOD=10000, 34 | LR_SCHEDULER=dict( 35 | MAX_ITER=270000, 36 | STEPS=(210000, 250000), 37 | ), 38 | OPTIMIZER=dict( 39 | BASE_LR=0.01, 40 | ), 41 | IMS_PER_BATCH=16, 42 | ), 43 | INPUT=dict( 44 | AUG=dict( 45 | TRAIN_PIPELINES=[ 46 | ("ResizeShortestEdge", 47 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", 52 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 53 | ], 54 | ) 55 | ), 56 | TEST=dict( 57 | EVAL_PEROID=10000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms.argmax/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.3x_ms 2 | 3 | seed: 46353074 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.392 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.565 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.427 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.246 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.428 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.331 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.564 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.617 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.430 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.645 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.769 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 39.177 | 56.537 | 42.744 | 24.604 | 42.761 | 49.431 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 55.077 | bicycle | 28.963 | car | 44.556 | 30 | | motorcycle | 40.411 | airplane | 62.600 | bus | 64.460 | 31 | | train | 61.501 | truck | 32.743 | boat | 24.498 | 32 | | traffic light | 27.148 | fire hydrant | 66.437 | stop sign | 66.019 | 33 | | parking meter | 41.497 | bench | 22.068 | bird | 32.931 | 34 | | cat | 63.692 | dog | 55.942 | horse | 53.431 | 35 | | sheep | 49.970 | cow | 57.115 | elephant | 61.551 | 36 | | bear | 68.866 | zebra | 69.122 | giraffe | 67.020 | 37 | | backpack | 13.829 | umbrella | 35.503 | handbag | 13.864 | 38 | | tie | 30.570 | suitcase | 35.649 | frisbee | 63.528 | 39 | | skis | 21.487 | snowboard | 29.169 | sports ball | 48.539 | 40 | | kite | 43.850 | baseball bat | 25.057 | baseball glove | 33.055 | 41 | | skateboard | 49.974 | surfboard | 32.123 | tennis racket | 45.815 | 42 | | bottle | 36.810 | wine glass | 33.975 | cup | 39.115 | 43 | | fork | 28.607 | knife | 15.567 | spoon | 13.952 | 44 | | bowl | 39.732 | banana | 22.992 | apple | 18.159 | 45 | | sandwich | 32.600 | orange | 30.111 | broccoli | 22.669 | 46 | | carrot | 20.644 | hot dog | 29.884 | pizza | 48.627 | 47 | | donut | 47.200 | cake | 33.947 | chair | 25.801 | 48 | | couch | 
39.867 | potted plant | 24.210 | bed | 38.148 | 49 | | dining table | 26.551 | toilet | 57.943 | tv | 53.389 | 50 | | laptop | 51.368 | mouse | 61.005 | remote | 27.733 | 51 | | keyboard | 49.490 | cell phone | 31.307 | microwave | 54.264 | 52 | | oven | 32.144 | toaster | 26.384 | sink | 34.782 | 53 | | refrigerator | 51.548 | book | 12.528 | clock | 49.467 | 54 | | vase | 36.095 | scissors | 22.305 | teddy bear | 43.729 | 55 | | hair drier | 8.966 | toothbrush | 20.880 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | ), 26 | NMS_TYPE=None, 27 | ), 28 | DATASETS=dict( 29 | TRAIN=("coco_2017_train",), 30 | TEST=("coco_2017_val",), 31 | ), 32 | SOLVER=dict( 33 | CHECKPOINT_PERIOD=10000, 34 | LR_SCHEDULER=dict( 35 | MAX_ITER=270000, 36 | STEPS=(210000, 250000), 37 | ), 38 | OPTIMIZER=dict( 39 | BASE_LR=0.01, 40 | ), 41 | IMS_PER_BATCH=16, 42 | ), 43 | INPUT=dict( 44 | AUG=dict( 45 | TRAIN_PIPELINES=[ 46 | ("ResizeShortestEdge", 47 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", 52 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 53 | ], 54 | ) 55 | ), 56 | TEST=dict( 57 | EVAL_PEROID=10000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.3x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
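    For orientation, a sketch of how cvpods wires this file together at training time
    (illustrative only; the ``pods_train`` runner normally does this for you, so treat
    the snippet as a hypothetical driver rather than part of the repo):

        from config import config     # CustomFCOSConfig from the sibling config.py
        from net import build_model

        model = build_model(config)   # attaches build_backbone / build_shift_generator
                                      # to the config and instantiates the local FCOS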
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.6x_ms/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.6x_ms 2 | 3 | seed: 36847828 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.400 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.573 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.438 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.256 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.432 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.506 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.334 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.568 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.619 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.433 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.650 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.778 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 40.043 | 57.341 | 43.808 | 25.557 | 43.185 | 50.584 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.090 | bicycle | 29.735 | car | 45.618 | 30 | | motorcycle | 43.291 | airplane | 62.310 | bus | 64.722 | 31 | | train | 62.188 | truck | 33.790 | boat | 24.878 | 32 | | traffic light | 27.185 | fire hydrant | 67.150 | stop sign | 63.966 | 33 | | parking meter | 37.816 | bench | 21.362 | bird | 35.355 | 34 | | cat | 64.725 | dog | 56.913 | horse | 56.220 | 35 | | sheep | 52.330 | cow | 57.441 | elephant | 63.153 | 36 | | bear | 69.075 | zebra | 68.448 | giraffe | 66.663 | 37 | | backpack | 15.567 | umbrella | 37.764 | handbag | 14.582 | 38 | | tie | 33.100 | suitcase | 37.147 | frisbee | 67.163 | 39 | | skis | 23.415 | snowboard | 27.146 | sports ball | 49.088 | 40 | | kite | 44.871 | baseball bat | 25.033 | baseball glove | 35.477 | 41 | | skateboard | 51.668 | surfboard | 32.272 | tennis racket | 46.320 | 42 | | bottle | 38.550 | wine glass | 36.127 | cup | 41.104 | 43 | | fork | 31.559 | knife | 18.517 | spoon | 14.022 | 44 | | bowl | 39.755 | banana | 23.037 | apple | 18.548 | 45 | | sandwich | 33.698 | orange | 30.408 | broccoli | 22.510 | 46 | | carrot | 20.917 | hot dog | 31.513 | pizza | 48.674 | 47 | | donut | 43.536 | cake | 35.415 | chair | 27.109 | 48 | | couch | 
41.040 | potted plant | 23.775 | bed | 41.336 | 49 | | dining table | 27.386 | toilet | 59.928 | tv | 54.313 | 50 | | laptop | 53.567 | mouse | 62.256 | remote | 28.094 | 51 | | keyboard | 47.250 | cell phone | 32.362 | microwave | 55.658 | 52 | | oven | 33.503 | toaster | 33.645 | sink | 36.360 | 53 | | refrigerator | 52.201 | book | 13.569 | clock | 51.146 | 54 | | vase | 37.907 | scissors | 20.398 | teddy bear | 41.900 | 55 | | hair drier | 8.710 | toothbrush | 21.119 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.6x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | ), 26 | NMS_TYPE=None, 27 | ), 28 | DATASETS=dict( 29 | TRAIN=("coco_2017_train",), 30 | TEST=("coco_2017_val",), 31 | ), 32 | SOLVER=dict( 33 | CHECKPOINT_PERIOD=10000, 34 | LR_SCHEDULER=dict( 35 | MAX_ITER=540000, 36 | STEPS=(480000, 520000), 37 | ), 38 | OPTIMIZER=dict( 39 | BASE_LR=0.01, 40 | ), 41 | IMS_PER_BATCH=16, 42 | ), 43 | INPUT=dict( 44 | AUG=dict( 45 | TRAIN_PIPELINES=[ 46 | ("ResizeShortestEdge", 47 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", 52 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 53 | ], 54 | ) 55 | ), 56 | TEST=dict( 57 | EVAL_PEROID=10000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.6x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.9x_ms/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.coco.800size.9x_ms 2 | 3 | seed: 55805791 4 | 5 | ## Evaluation results for bbox: 6 | 7 | ``` 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.402 9 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.576 10 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.440 11 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.255 12 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.433 13 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.497 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.335 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.572 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.623 17 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.429 18 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.652 19 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.785 20 | ``` 21 | | AP | AP50 | AP75 | APs | APm | APl | 22 | |:------:|:------:|:------:|:------:|:------:|:------:| 23 | | 40.211 | 57.556 | 43.974 | 25.479 | 43.254 | 49.724 | 24 | 25 | ### Per-category bbox AP: 26 | 27 | | category | AP | category | AP | category | AP | 28 | |:--------------|:-------|:-------------|:-------|:---------------|:-------| 29 | | person | 56.461 | bicycle | 29.751 | car | 45.531 | 30 | | motorcycle | 41.378 | airplane | 66.387 | bus | 65.418 | 31 | | train | 63.702 | truck | 33.320 | boat | 25.286 | 32 | | traffic light | 27.916 | fire hydrant | 65.679 | stop sign | 65.608 | 33 | | parking meter | 43.400 | bench | 21.921 | bird | 35.702 | 34 | | cat | 64.798 | dog | 58.762 | horse | 55.727 | 35 | | sheep | 53.283 | cow | 59.048 | elephant | 63.088 | 36 | | bear | 69.722 | zebra | 68.877 | giraffe | 66.154 | 37 | | backpack | 14.874 | umbrella | 38.954 | handbag | 14.525 | 38 | | tie | 32.606 | suitcase | 38.137 | frisbee | 63.666 | 39 | | skis | 21.814 | snowboard | 29.888 | sports ball | 48.920 | 40 | | kite | 44.437 | baseball bat | 26.805 | baseball glove | 34.829 | 41 | | skateboard | 51.811 | surfboard | 32.650 | tennis racket | 46.308 | 42 | | bottle | 38.541 | wine glass | 34.170 | cup | 41.653 | 43 | | fork | 31.612 | knife | 16.877 | spoon | 14.055 | 44 | | bowl | 39.468 | banana | 23.297 | apple | 19.211 | 45 | | sandwich | 30.157 | orange | 29.881 | broccoli | 21.950 | 46 | | carrot | 21.754 | hot dog | 29.781 | pizza | 48.315 | 47 | | donut | 46.185 | cake | 35.278 | chair | 27.498 | 48 | | couch | 
40.332 | potted plant | 26.441 | bed | 38.089 | 49 | | dining table | 27.612 | toilet | 60.464 | tv | 54.461 | 50 | | laptop | 55.213 | mouse | 62.109 | remote | 29.338 | 51 | | keyboard | 47.787 | cell phone | 31.711 | microwave | 54.977 | 52 | | oven | 34.952 | toaster | 35.348 | sink | 34.886 | 53 | | refrigerator | 48.997 | book | 14.370 | clock | 51.591 | 54 | | vase | 40.057 | scissors | 21.971 | teddy bear | 43.441 | 55 | | hair drier | 5.964 | toothbrush | 19.921 | | | 56 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.9x_ms/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NORM_REG_TARGETS=True, 15 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 16 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 17 | FOCAL_LOSS_GAMMA=2.0, 18 | FOCAL_LOSS_ALPHA=0.25, 19 | IOU_LOSS_TYPE="giou", 20 | REG_WEIGHT=2.0, 21 | ), 22 | POTO=dict( 23 | ALPHA=0.8, 24 | CENTER_SAMPLING_RADIUS=1.5, 25 | ), 26 | NMS_TYPE=None, 27 | ), 28 | DATASETS=dict( 29 | TRAIN=("coco_2017_train",), 30 | TEST=("coco_2017_val",), 31 | ), 32 | SOLVER=dict( 33 | CHECKPOINT_PERIOD=10000, 34 | LR_SCHEDULER=dict( 35 | MAX_ITER=810000, 36 | STEPS=(750000, 790000), 37 | ), 38 | OPTIMIZER=dict( 39 | BASE_LR=0.01, 40 | ), 41 | IMS_PER_BATCH=16, 42 | ), 43 | INPUT=dict( 44 | AUG=dict( 45 | TRAIN_PIPELINES=[ 46 | ("ResizeShortestEdge", 47 | dict(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", 52 | dict(short_edge_length=800, max_size=1333, sample_style="choice")), 53 | ], 54 | ) 55 | ), 56 | TEST=dict( 57 | EVAL_PEROID=10000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/coco/poto.res50.fpn.coco.800size.9x_ms/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/atss.res50.fpn.crowdhuman.800size.30k/README.md: -------------------------------------------------------------------------------- 1 | # atss.res50.fpn.crowdhuman.800size.30k 2 | 3 | | AP | mMR | Recall | 4 | |:-----:|:-----:|:--------:| 5 | | 0.872 | 0.497 | 0.940 | 6 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/atss.res50.fpn.crowdhuman.800size.30k/atss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) BaseDetection, Inc. and its affiliates. All Rights Reserved 4 | 5 | import logging 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | from torch import nn 10 | 11 | from cvpods.layers import ShapeSpec, cat, generalized_batched_nms 12 | from cvpods.modeling.box_regression import Shift2BoxTransform 13 | from cvpods.modeling.losses import iou_loss, sigmoid_focal_loss_jit 14 | from cvpods.modeling.meta_arch.fcos import FCOSHead, permute_all_cls_and_box_to_N_HWA_K_and_concat 15 | from cvpods.modeling.meta_arch.retinanet import permute_to_N_HWA_K 16 | from cvpods.modeling.postprocessing import detector_postprocess 17 | from cvpods.structures import Boxes, ImageList, Instances, pairwise_iou 18 | from cvpods.utils import comm, log_first_n 19 | 20 | 21 | class ATSS(nn.Module): 22 | """ 23 | Implement ATSS (https://arxiv.org/abs/1912.02424). 
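    In short, ATSS replaces hand-tuned IoU and center-sampling rules with an adaptive,
    per-ground-truth assignment: on every FPN level the top-k locations closest to the
    gt center are collected as candidates, their IoUs with the gt are computed, and the
    positive threshold is set to the mean plus the standard deviation of those IoUs;
    candidates above the threshold whose centers fall inside the gt box become positives.
    ``cfg.MODEL.ATSS.TOPK`` and ``cfg.MODEL.ATSS.ANCHOR_SCALE`` (read in ``__init__``
    below) expose the candidate count per level and the anchor scale used for this
    selection.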
24 | """ 25 | def __init__(self, cfg): 26 | super().__init__() 27 | 28 | self.device = torch.device(cfg.MODEL.DEVICE) 29 | 30 | # fmt: off 31 | self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES 32 | self.in_features = cfg.MODEL.FCOS.IN_FEATURES 33 | self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES 34 | # Loss parameters: 35 | self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA 36 | self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA 37 | self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE 38 | self.reg_weight = cfg.MODEL.FCOS.REG_WEIGHT 39 | # Inference parameters: 40 | self.score_threshold = cfg.MODEL.FCOS.SCORE_THRESH_TEST 41 | self.topk_candidates = cfg.MODEL.FCOS.TOPK_CANDIDATES_TEST 42 | self.nms_threshold = cfg.MODEL.FCOS.NMS_THRESH_TEST 43 | self.nms_type = cfg.MODEL.NMS_TYPE 44 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 45 | # fmt: on 46 | 47 | self.backbone = cfg.build_backbone( 48 | cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) 49 | 50 | backbone_shape = self.backbone.output_shape() 51 | feature_shapes = [backbone_shape[f] for f in self.in_features] 52 | self.head = FCOSHead(cfg, feature_shapes) 53 | self.shift_generator = cfg.build_shift_generator(cfg, feature_shapes) 54 | 55 | # Matching and loss 56 | self.shift2box_transform = Shift2BoxTransform( 57 | weights=cfg.MODEL.FCOS.BBOX_REG_WEIGHTS) 58 | self.anchor_scale = cfg.MODEL.ATSS.ANCHOR_SCALE 59 | self.atss_topk = cfg.MODEL.ATSS.TOPK 60 | 61 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( 62 | 3, 1, 1) 63 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( 64 | 3, 1, 1) 65 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 66 | self.to(self.device) 67 | 68 | def forward(self, batched_inputs): 69 | """ 70 | Args: 71 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 72 | Each item in the list contains the inputs for one image. 73 | For now, each item in the list is a dict that contains: 74 | 75 | * image: Tensor, image in (C, H, W) format. 76 | * instances: Instances 77 | 78 | Other information that's included in the original dicts, such as: 79 | 80 | * "height", "width" (int): the output resolution of the model, used in inference. 81 | See :meth:`postprocess` for details. 82 | Returns: 83 | dict[str: Tensor]: 84 | mapping from a named loss to a tensor storing the loss. Used during training only. 
85 | """ 86 | images = self.preprocess_image(batched_inputs) 87 | if "instances" in batched_inputs[0]: 88 | gt_instances = [ 89 | x["instances"].to(self.device) for x in batched_inputs 90 | ] 91 | elif "targets" in batched_inputs[0]: 92 | log_first_n( 93 | logging.WARN, 94 | "'targets' in the model inputs is now renamed to 'instances'!", 95 | n=10) 96 | gt_instances = [ 97 | x["targets"].to(self.device) for x in batched_inputs 98 | ] 99 | else: 100 | gt_instances = None 101 | 102 | features = self.backbone(images.tensor) 103 | features = [features[f] for f in self.in_features] 104 | box_cls, box_delta, box_center = self.head(features) 105 | shifts = self.shift_generator(features) 106 | 107 | if self.training: 108 | # remove gt_instances with ignore label 109 | gt_instances = [inst[inst.gt_classes >= 0] for inst in gt_instances] 110 | gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth( 111 | shifts, gt_instances) 112 | return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness, 113 | box_cls, box_delta, box_center) 114 | else: 115 | results = self.inference(box_cls, box_delta, box_center, shifts, 116 | images) 117 | processed_results = [] 118 | for results_per_image, input_per_image, image_size in zip( 119 | results, batched_inputs, images.image_sizes): 120 | height = input_per_image.get("height", image_size[0]) 121 | width = input_per_image.get("width", image_size[1]) 122 | r = detector_postprocess(results_per_image, height, width) 123 | processed_results.append({"instances": r}) 124 | return processed_results 125 | 126 | def losses(self, gt_classes, gt_shifts_deltas, gt_centerness, 127 | pred_class_logits, pred_shift_deltas, pred_centerness): 128 | """ 129 | Args: 130 | For `gt_classes`, `gt_shifts_deltas` and `gt_centerness` parameters, see 131 | :meth:`FCOS.get_ground_truth`. 132 | Their shapes are (N, R) and (N, R, 4), respectively, where R is 133 | the total number of shifts across levels, i.e. sum(Hi x Wi) 134 | For `pred_class_logits`, `pred_shift_deltas` and `pred_centerness`, see 135 | :meth:`FCOSHead.forward`. 136 | 137 | Returns: 138 | dict[str: Tensor]: 139 | mapping from a named loss to a scalar tensor 140 | storing the loss. Used during training only. The dict keys are: 141 | "loss_cls" and "loss_box_reg" 142 | """ 143 | pred_class_logits, pred_shift_deltas, pred_centerness = \ 144 | permute_all_cls_and_box_to_N_HWA_K_and_concat( 145 | pred_class_logits, pred_shift_deltas, pred_centerness, 146 | self.num_classes 147 | ) # Shapes: (N x R, K) and (N x R, 4), respectively. 
148 | 149 | gt_classes = gt_classes.flatten() 150 | gt_shifts_deltas = gt_shifts_deltas.view(-1, 4) 151 | gt_centerness = gt_centerness.view(-1, 1) 152 | 153 | valid_idxs = gt_classes >= 0 154 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 155 | num_foreground = foreground_idxs.sum() 156 | 157 | gt_classes_target = torch.zeros_like(pred_class_logits) 158 | gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1 159 | 160 | num_foreground = comm.all_reduce(num_foreground) / float(comm.get_world_size()) 161 | num_foreground_centerness = gt_centerness[foreground_idxs].sum() 162 | num_targets = comm.all_reduce(num_foreground_centerness) / float(comm.get_world_size()) 163 | 164 | # logits loss 165 | loss_cls = sigmoid_focal_loss_jit( 166 | pred_class_logits[valid_idxs], 167 | gt_classes_target[valid_idxs], 168 | alpha=self.focal_loss_alpha, 169 | gamma=self.focal_loss_gamma, 170 | reduction="sum", 171 | ) / max(1.0, num_foreground) 172 | 173 | # regression loss 174 | loss_box_reg = iou_loss( 175 | pred_shift_deltas[foreground_idxs], 176 | gt_shifts_deltas[foreground_idxs], 177 | gt_centerness[foreground_idxs], 178 | box_mode="ltrb", 179 | loss_type=self.iou_loss_type, 180 | reduction="sum", 181 | ) / max(1.0, num_targets) * self.reg_weight 182 | # ) / max(1.0, num_foreground) * self.reg_weight 183 | 184 | # centerness loss 185 | loss_centerness = F.binary_cross_entropy_with_logits( 186 | pred_centerness[foreground_idxs], 187 | gt_centerness[foreground_idxs], 188 | reduction="sum", 189 | ) / max(1, num_foreground) 190 | 191 | return { 192 | "loss_cls": loss_cls, 193 | "loss_box_reg": loss_box_reg, 194 | "loss_centerness": loss_centerness 195 | } 196 | 197 | @torch.no_grad() 198 | def get_ground_truth(self, shifts, targets): 199 | """ 200 | Args: 201 | shifts (list[list[Tensor]]): a list of N=#image elements. Each is a 202 | list of #feature level tensors. The tensors contains shifts of 203 | this image on the specific feature level. 204 | targets (list[Instances]): a list of N `Instances`s. The i-th 205 | `Instances` contains the ground-truth per-instance annotations 206 | for the i-th input image. Specify `targets` during training only. 207 | 208 | Returns: 209 | gt_classes (Tensor): 210 | An integer tensor of shape (N, R) storing ground-truth 211 | labels for each shift. 212 | R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. 213 | Shifts in the valid boxes are assigned their corresponding label in the 214 | [0, K-1] range. Shifts in the background are assigned the label "K". 215 | Shifts in the ignore areas are assigned a label "-1", i.e. ignore. 216 | gt_shifts_deltas (Tensor): 217 | Shape (N, R, 4). 218 | The last dimension represents ground-truth shift2box transform 219 | targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. 220 | The values in the tensor are meaningful only when the corresponding 221 | shift is labeled as foreground. 222 | gt_centerness (Tensor): 223 | An float tensor (0, 1) of shape (N, R) whose values in [0, 1] 224 | storing ground-truth centerness for each shift. 
225 | 226 | """ 227 | gt_classes = [] 228 | gt_shifts_deltas = [] 229 | gt_centerness = [] 230 | 231 | for shifts_per_image, targets_per_image in zip(shifts, targets): 232 | shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) 233 | 234 | gt_boxes = targets_per_image.gt_boxes 235 | 236 | is_in_boxes = self.shift2box_transform.get_deltas( 237 | shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1) 238 | ).min(dim=-1).values > 0 239 | 240 | gt_positions_iou = [] 241 | candidate_idxs = [] 242 | base = 0 243 | for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): 244 | gt_positions_iou.append(pairwise_iou( 245 | gt_boxes, 246 | Boxes(torch.cat(( 247 | shifts_i - stride * self.anchor_scale / 2, 248 | shifts_i + stride * self.anchor_scale / 2, 249 | ), dim=1)) 250 | )) 251 | 252 | distances = ( 253 | gt_boxes.get_centers().unsqueeze(1) - shifts_i 254 | ).pow_(2).sum(dim=-1).sqrt_() 255 | _, topk_idxs = distances.topk( 256 | self.atss_topk, dim=1, largest=False) 257 | candidate_idxs.append(base + topk_idxs) 258 | base += len(shifts_i) 259 | gt_positions_iou = torch.cat(gt_positions_iou, dim=1) 260 | candidate_idxs = torch.cat(candidate_idxs, dim=1) 261 | 262 | candidate_ious = gt_positions_iou.gather(1, candidate_idxs) 263 | ious_thr = (candidate_ious.mean(dim=1, keepdim=True) 264 | + candidate_ious.std(dim=1, keepdim=True)) 265 | is_foreground = torch.zeros_like( 266 | is_in_boxes).scatter_(1, candidate_idxs, True) 267 | is_foreground &= gt_positions_iou >= ious_thr 268 | 269 | gt_positions_iou[~is_in_boxes] = -1 270 | gt_positions_iou[~is_foreground] = -1 271 | 272 | # if there are still more than one objects for a position, 273 | # we choose the one with maximum iou 274 | positions_max_iou, gt_matched_idxs = gt_positions_iou.max(dim=0) 275 | 276 | # ground truth box regression 277 | gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas( 278 | shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor) 279 | 280 | # ground truth classes 281 | has_gt = len(targets_per_image) > 0 282 | if has_gt: 283 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 284 | # Shifts with iou -1 are treated as background. 285 | gt_classes_i[positions_max_iou == -1] = self.num_classes 286 | else: 287 | gt_classes_i = torch.zeros_like( 288 | gt_matched_idxs) + self.num_classes 289 | 290 | # ground truth centerness 291 | left_right = gt_shifts_reg_deltas_i[:, [0, 2]] 292 | top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]] 293 | gt_centerness_i = torch.sqrt( 294 | (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp_(min=0) 295 | * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp_(min=0) 296 | ) 297 | 298 | gt_classes.append(gt_classes_i) 299 | gt_shifts_deltas.append(gt_shifts_reg_deltas_i) 300 | gt_centerness.append(gt_centerness_i) 301 | 302 | return torch.stack(gt_classes), torch.stack( 303 | gt_shifts_deltas), torch.stack(gt_centerness) 304 | 305 | def inference(self, box_cls, box_delta, box_center, shifts, images): 306 | """ 307 | Arguments: 308 | box_cls, box_delta, box_center: Same as the output of :meth:`FCOSHead.forward` 309 | shifts (list[list[Tensor]): a list of #images elements. Each is a 310 | list of #feature level tensor. The tensor contain shifts of this 311 | image on the specific feature level. 312 | images (ImageList): the input images 313 | 314 | Returns: 315 | results (List[Instances]): a list of #images elements. 
316 | """ 317 | assert len(shifts) == len(images) 318 | results = [] 319 | 320 | box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] 321 | box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] 322 | box_center = [permute_to_N_HWA_K(x, 1) for x in box_center] 323 | # list[Tensor], one per level, each has shape (N, Hi x Wi, K or 4) 324 | 325 | for img_idx, shifts_per_image in enumerate(shifts): 326 | image_size = images.image_sizes[img_idx] 327 | box_cls_per_image = [ 328 | box_cls_per_level[img_idx] for box_cls_per_level in box_cls 329 | ] 330 | box_reg_per_image = [ 331 | box_reg_per_level[img_idx] for box_reg_per_level in box_delta 332 | ] 333 | box_ctr_per_image = [ 334 | box_ctr_per_level[img_idx] for box_ctr_per_level in box_center 335 | ] 336 | results_per_image = self.inference_single_image( 337 | box_cls_per_image, box_reg_per_image, box_ctr_per_image, 338 | shifts_per_image, tuple(image_size)) 339 | results.append(results_per_image) 340 | return results 341 | 342 | def inference_single_image(self, box_cls, box_delta, box_center, shifts, 343 | image_size): 344 | """ 345 | Single-image inference. Return bounding-box detection results by thresholding 346 | on scores and applying non-maximum suppression (NMS). 347 | 348 | Arguments: 349 | box_cls (list[Tensor]): list of #feature levels. Each entry contains 350 | tensor of size (H x W, K) 351 | box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. 352 | box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1. 353 | shifts (list[Tensor]): list of #feature levels. Each entry contains 354 | a tensor, which contains all the shifts for that 355 | image in that feature level. 356 | image_size (tuple(H, W)): a tuple of the image height and width. 357 | 358 | Returns: 359 | Same as `inference`, but for only one image. 360 | """ 361 | boxes_all = [] 362 | scores_all = [] 363 | class_idxs_all = [] 364 | 365 | # Iterate over every feature level 366 | for box_cls_i, box_reg_i, box_ctr_i, shifts_i in zip( 367 | box_cls, box_delta, box_center, shifts): 368 | # (HxWxK,) 369 | box_cls_i = box_cls_i.flatten().sigmoid_() 370 | 371 | # Keep top k top scoring indices only. 
372 | num_topk = min(self.topk_candidates, box_reg_i.size(0)) 373 | # torch.sort is actually faster than .topk (at least on GPUs) 374 | predicted_prob, topk_idxs = box_cls_i.sort(descending=True) 375 | predicted_prob = predicted_prob[:num_topk] 376 | topk_idxs = topk_idxs[:num_topk] 377 | 378 | # filter out the proposals with low confidence score 379 | keep_idxs = predicted_prob > self.score_threshold 380 | predicted_prob = predicted_prob[keep_idxs] 381 | topk_idxs = topk_idxs[keep_idxs] 382 | 383 | shift_idxs = topk_idxs // self.num_classes 384 | classes_idxs = topk_idxs % self.num_classes 385 | 386 | box_reg_i = box_reg_i[shift_idxs] 387 | shifts_i = shifts_i[shift_idxs] 388 | # predict boxes 389 | predicted_boxes = self.shift2box_transform.apply_deltas( 390 | box_reg_i, shifts_i) 391 | 392 | box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs] 393 | predicted_prob = torch.sqrt(predicted_prob * box_ctr_i) 394 | 395 | boxes_all.append(predicted_boxes) 396 | scores_all.append(predicted_prob) 397 | class_idxs_all.append(classes_idxs) 398 | 399 | boxes_all, scores_all, class_idxs_all = [ 400 | cat(x) for x in [boxes_all, scores_all, class_idxs_all] 401 | ] 402 | 403 | keep = generalized_batched_nms( 404 | boxes_all, scores_all, class_idxs_all, 405 | self.nms_threshold, nms_type=self.nms_type 406 | ) 407 | keep = keep[:self.max_detections_per_image] 408 | 409 | result = Instances(image_size) 410 | result.pred_boxes = Boxes(boxes_all[keep]) 411 | result.scores = scores_all[keep] 412 | result.pred_classes = class_idxs_all[keep] 413 | return result 414 | 415 | def preprocess_image(self, batched_inputs): 416 | """ 417 | Normalize, pad and batch the input images. 418 | """ 419 | images = [x["image"].to(self.device) for x in batched_inputs] 420 | images = [self.normalizer(x) for x in images] 421 | images = ImageList.from_tensors(images, 422 | self.backbone.size_divisibility) 423 | return images 424 | 425 | def _inference_for_ms_test(self, batched_inputs): 426 | """ 427 | function used for multiscale test, will be refactor in the future. 428 | The same input with `forward` function. 
429 | """ 430 | assert not self.training, "inference mode with training=True" 431 | assert len(batched_inputs) == 1, "inference image number > 1" 432 | images = self.preprocess_image(batched_inputs) 433 | 434 | features = self.backbone(images.tensor) 435 | features = [features[f] for f in self.in_features] 436 | box_cls, box_delta, box_center = self.head(features) 437 | shifts = self.shift_generator(features) 438 | 439 | results = self.inference(box_cls, box_delta, box_center, shifts, images) 440 | for results_per_image, input_per_image, image_size in zip( 441 | results, batched_inputs, images.image_sizes 442 | ): 443 | height = input_per_image.get("height", image_size[0]) 444 | width = input_per_image.get("width", image_size[1]) 445 | processed_results = detector_postprocess(results_per_image, height, width) 446 | return processed_results 447 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/atss.res50.fpn.crowdhuman.800size.30k/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NUM_CLASSES=1, 15 | CENTERNESS_ON_REG=True, 16 | NORM_REG_TARGETS=True, 17 | NMS_THRESH_TEST=0.6, 18 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 19 | FOCAL_LOSS_GAMMA=2.0, 20 | FOCAL_LOSS_ALPHA=0.25, 21 | IOU_LOSS_TYPE="giou", 22 | REG_WEIGHT=2.0, 23 | ), 24 | ATSS=dict( 25 | ANCHOR_SCALE=8, 26 | TOPK=9, 27 | ), 28 | ), 29 | DATASETS=dict( 30 | TRAIN=("crowdhuman_train",), 31 | TEST=("crowdhuman_val",), 32 | ), 33 | SOLVER=dict( 34 | CHECKPOINT_PERIOD=5000, 35 | LR_SCHEDULER=dict( 36 | MAX_ITER=30000, 37 | STEPS=(20000, 25000), 38 | ), 39 | OPTIMIZER=dict( 40 | BASE_LR=0.01, 41 | ), 42 | IMS_PER_BATCH=16, 43 | ), 44 | INPUT=dict( 45 | AUG=dict( 46 | TRAIN_PIPELINES=[ 47 | ("ResizeShortestEdge", dict(short_edge_length=(800,), max_size=1400, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", dict(short_edge_length=800, max_size=1400, sample_style="choice")), 52 | ], 53 | ) 54 | ), 55 | TEST=dict( 56 | DETECTIONS_PER_IMAGE=500, 57 | EVAL_PEROID=5000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/atss.res50.fpn.crowdhuman.800size.30k/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | from atss import ATSS 8 | 9 | 10 | def build_backbone(cfg, input_shape=None): 11 | """ 12 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
13 | 14 | Returns: 15 | an instance of :class:`Backbone` 16 | """ 17 | if input_shape is None: 18 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 19 | 20 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 21 | assert isinstance(backbone, Backbone) 22 | return backbone 23 | 24 | 25 | def build_shift_generator(cfg, input_shape): 26 | 27 | return ShiftGenerator(cfg, input_shape) 28 | 29 | 30 | def build_model(cfg): 31 | 32 | cfg.build_backbone = build_backbone 33 | cfg.build_shift_generator = build_shift_generator 34 | 35 | model = ATSS(cfg) 36 | logger = logging.getLogger(__name__) 37 | logger.info("Model:\n{}".format(model)) 38 | return model 39 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/fcos.res50.fpn.crowdhuman.800size.30k/README.md: -------------------------------------------------------------------------------- 1 | # fcos.res50.fpn.crowdhuman.800size.30k 2 | 3 | | AP | mMR | Recall | 4 | |:-----:|:-----:|:--------:| 5 | | 0.861 | 0.549 | 0.942 | 6 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/fcos.res50.fpn.crowdhuman.800size.30k/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NUM_CLASSES=1, 15 | CENTERNESS_ON_REG=True, 16 | NORM_REG_TARGETS=True, 17 | NMS_THRESH_TEST=0.6, 18 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 19 | FOCAL_LOSS_GAMMA=2.0, 20 | FOCAL_LOSS_ALPHA=0.25, 21 | IOU_LOSS_TYPE="giou", 22 | CENTER_SAMPLING_RADIUS=1.5, 23 | OBJECT_SIZES_OF_INTEREST=[ 24 | [-1, 64], 25 | [64, 128], 26 | [128, 256], 27 | [256, 512], 28 | [512, float("inf")], 29 | ], 30 | ), 31 | ), 32 | DATASETS=dict( 33 | TRAIN=("crowdhuman_train",), 34 | TEST=("crowdhuman_val",), 35 | ), 36 | SOLVER=dict( 37 | CHECKPOINT_PERIOD=5000, 38 | LR_SCHEDULER=dict( 39 | MAX_ITER=30000, 40 | STEPS=(20000, 25000), 41 | ), 42 | OPTIMIZER=dict( 43 | BASE_LR=0.01, 44 | ), 45 | IMS_PER_BATCH=16, 46 | ), 47 | INPUT=dict( 48 | AUG=dict( 49 | TRAIN_PIPELINES=[ 50 | ("ResizeShortestEdge", dict(short_edge_length=(800,), max_size=1400, sample_style="choice")), 51 | ("RandomFlip", dict()), 52 | ], 53 | TEST_PIPELINES=[ 54 | ("ResizeShortestEdge", dict(short_edge_length=800, max_size=1400, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | DETECTIONS_PER_IMAGE=500, 60 | EVAL_PEROID=5000, 61 | ), 62 | OUTPUT_DIR=osp.join( 63 | '/data/Outputs/model_logs/cvpods_playground', 64 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 65 | ) 66 | 67 | 68 | class CustomFCOSConfig(FCOSConfig): 69 | def __init__(self): 70 | super(CustomFCOSConfig, self).__init__() 71 | self._register_configuration(_config_dict) 72 | 73 | 74 | config = CustomFCOSConfig() 75 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/fcos.res50.fpn.crowdhuman.800size.30k/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from 
cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | from fcos import FCOS 8 | 9 | 10 | def build_backbone(cfg, input_shape=None): 11 | """ 12 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 13 | 14 | Returns: 15 | an instance of :class:`Backbone` 16 | """ 17 | if input_shape is None: 18 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 19 | 20 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 21 | assert isinstance(backbone, Backbone) 22 | return backbone 23 | 24 | 25 | def build_shift_generator(cfg, input_shape): 26 | 27 | return ShiftGenerator(cfg, input_shape) 28 | 29 | 30 | def build_model(cfg): 31 | 32 | cfg.build_backbone = build_backbone 33 | cfg.build_shift_generator = build_shift_generator 34 | 35 | model = FCOS(cfg) 36 | logger = logging.getLogger(__name__) 37 | logger.info("Model:\n{}".format(model)) 38 | return model 39 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux 2 | 3 | | AP | mMR | Recall | 4 | |:-----:|:-----:|:--------:| 5 | | 0.891 | 0.489 | 0.965 | 6 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NUM_CLASSES=1, 15 | NORM_REG_TARGETS=True, 16 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 17 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 18 | FOCAL_LOSS_GAMMA=2.0, 19 | FOCAL_LOSS_ALPHA=0.25, 20 | IOU_LOSS_TYPE="giou", 21 | REG_WEIGHT=2.0, 22 | ), 23 | POTO=dict( 24 | ALPHA=0.8, 25 | CENTER_SAMPLING_RADIUS=0.0, # inside gt box 26 | AUX_TOPK=9, 27 | FILTER_KERNEL_SIZE=3, 28 | FILTER_TAU=2, 29 | ), 30 | NMS_TYPE=None, 31 | ), 32 | DATASETS=dict( 33 | TRAIN=("crowdhuman_train",), 34 | TEST=("crowdhuman_val",), 35 | ), 36 | SOLVER=dict( 37 | CHECKPOINT_PERIOD=5000, 38 | LR_SCHEDULER=dict( 39 | MAX_ITER=30000, 40 | STEPS=(20000, 25000), 41 | ), 42 | OPTIMIZER=dict( 43 | BASE_LR=0.01, 44 | ), 45 | IMS_PER_BATCH=16, 46 | ), 47 | INPUT=dict( 48 | AUG=dict( 49 | TRAIN_PIPELINES=[ 50 | ("ResizeShortestEdge", dict(short_edge_length=(800,), max_size=1400, sample_style="choice")), 51 | ("RandomFlip", dict()), 52 | ], 53 | TEST_PIPELINES=[ 54 | ("ResizeShortestEdge", dict(short_edge_length=800, max_size=1400, sample_style="choice")), 55 | ], 56 | ) 57 | ), 58 | TEST=dict( 59 | DETECTIONS_PER_IMAGE=500, 60 | EVAL_PEROID=5000, 61 | ), 62 | OUTPUT_DIR=osp.join( 63 | '/data/Outputs/model_logs/cvpods_playground', 64 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 65 | ) 66 | 67 | 68 | class CustomFCOSConfig(FCOSConfig): 69 | def __init__(self): 70 | super(CustomFCOSConfig, self).__init__() 71 | self._register_configuration(_config_dict) 72 | 73 | 74 | config = CustomFCOSConfig() 75 | -------------------------------------------------------------------------------- 
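Each playground folder in this dump pairs a config.py, which builds and exports a `config` object, with a net.py, which exports `build_model(cfg)` and wires in the backbone and shift-generator builders. The snippet below is a minimal, hypothetical driver showing how those two files fit together when run from inside one of these folders; the actual cvpods train/test entry points are not part of this excerpt, so everything other than the `config` and `build_model` imports (the CPU override, the dummy image) is an illustrative assumption.

```python
# Minimal sketch, assuming it is executed from inside one of the playground
# folders above (so config.py, net.py and the local fcos.py/atss.py are
# importable). Only `config` and `build_model` come from the files shown;
# the device override and the dummy input are assumptions for illustration.
import torch

from config import config        # CustomFCOSConfig instance defined in config.py
from net import build_model      # attaches build_backbone / build_shift_generator

config.MODEL.DEVICE = "cpu"      # assumption: avoid requiring a GPU for this demo
model = build_model(config)      # FCOS (or ATSS) meta-architecture
model.eval()

# batched_inputs follows the forward() docstring: one dict per image with a
# (C, H, W) tensor under "image" and the desired output resolution.
dummy = {"image": torch.rand(3, 800, 1280) * 255, "height": 800, "width": 1280}
with torch.no_grad():
    outputs = model([dummy])
print(outputs[0]["instances"])   # Instances with pred_boxes, scores, pred_classes
```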
/playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf.aux/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.crowdhuman.800size.30k.3dmf 2 | 3 | | AP | mMR | Recall | 4 | |:-----:|:-----:|:--------:| 5 | | 0.888 | 0.510 | 0.966 | 6 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NUM_CLASSES=1, 15 | NORM_REG_TARGETS=True, 16 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 17 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 18 | FOCAL_LOSS_GAMMA=2.0, 19 | FOCAL_LOSS_ALPHA=0.25, 20 | IOU_LOSS_TYPE="giou", 21 | REG_WEIGHT=2.0, 22 | ), 23 | POTO=dict( 24 | ALPHA=0.8, 25 | CENTER_SAMPLING_RADIUS=0.0, # inside gt box 26 | FILTER_KERNEL_SIZE=3, 27 | FILTER_TAU=2, 28 | ), 29 | NMS_TYPE=None, 30 | ), 31 | DATASETS=dict( 32 | TRAIN=("crowdhuman_train",), 33 | TEST=("crowdhuman_val",), 34 | ), 35 | SOLVER=dict( 36 | CHECKPOINT_PERIOD=5000, 37 | LR_SCHEDULER=dict( 38 | MAX_ITER=30000, 39 | STEPS=(20000, 25000), 40 | ), 41 | OPTIMIZER=dict( 42 | BASE_LR=0.01, 43 | ), 44 | IMS_PER_BATCH=16, 45 | ), 46 | INPUT=dict( 47 | AUG=dict( 48 | TRAIN_PIPELINES=[ 49 | ("ResizeShortestEdge", dict(short_edge_length=(800,), max_size=1400, sample_style="choice")), 50 | ("RandomFlip", dict()), 51 | ], 52 | TEST_PIPELINES=[ 53 | ("ResizeShortestEdge", dict(short_edge_length=800, max_size=1400, sample_style="choice")), 54 | ], 55 | ) 56 | ), 57 | TEST=dict( 58 | DETECTIONS_PER_IMAGE=500, 59 | EVAL_PEROID=5000, 60 | ), 61 | OUTPUT_DIR=osp.join( 62 | '/data/Outputs/model_logs/cvpods_playground', 63 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 64 | ) 65 | 66 | 67 | class 
CustomFCOSConfig(FCOSConfig): 68 | def __init__(self): 69 | super(CustomFCOSConfig, self).__init__() 70 | self._register_configuration(_config_dict) 71 | 72 | 73 | config = CustomFCOSConfig() 74 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k.3dmf/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k/README.md: -------------------------------------------------------------------------------- 1 | # poto.res50.fpn.crowdhuman.800size.30k 2 | 3 | | AP | mMR | Recall | 4 | |:-----:|:-----:|:--------:| 5 | | 0.885 | 0.522 | 0.963 | 6 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from cvpods.configs.fcos_config import FCOSConfig 4 | 5 | _config_dict = dict( 6 | MODEL=dict( 7 | WEIGHTS="detectron2://ImageNetPretrained/MSRA/R-50.pkl", 8 | RESNETS=dict(DEPTH=50), 9 | SHIFT_GENERATOR=dict( 10 | NUM_SHIFTS=1, 11 | OFFSET=0.5, 12 | ), 13 | FCOS=dict( 14 | NUM_CLASSES=1, 15 | NORM_REG_TARGETS=True, 16 | NMS_THRESH_TEST=1.0, # disable NMS when NMS threshold is 1.0 17 | BBOX_REG_WEIGHTS=(1.0, 1.0, 1.0, 1.0), 18 | FOCAL_LOSS_GAMMA=2.0, 19 | FOCAL_LOSS_ALPHA=0.25, 20 | IOU_LOSS_TYPE="giou", 21 | REG_WEIGHT=2.0, 22 | ), 23 | POTO=dict( 24 | ALPHA=0.8, 25 | CENTER_SAMPLING_RADIUS=0.0, # inside gt box 26 | ), 27 | NMS_TYPE=None, 28 | ), 29 | DATASETS=dict( 30 | TRAIN=("crowdhuman_train",), 31 | TEST=("crowdhuman_val",), 32 | ), 33 | SOLVER=dict( 34 | CHECKPOINT_PERIOD=5000, 35 | LR_SCHEDULER=dict( 36 | MAX_ITER=30000, 37 | STEPS=(20000, 25000), 38 | ), 39 | OPTIMIZER=dict( 40 | BASE_LR=0.01, 41 | ), 42 | IMS_PER_BATCH=16, 43 | ), 44 | INPUT=dict( 45 | AUG=dict( 46 | TRAIN_PIPELINES=[ 47 | ("ResizeShortestEdge", dict(short_edge_length=(800,), max_size=1400, sample_style="choice")), 48 | ("RandomFlip", dict()), 49 | ], 50 | TEST_PIPELINES=[ 51 | ("ResizeShortestEdge", dict(short_edge_length=800, max_size=1400, sample_style="choice")), 52 | ], 53 | ) 54 | ), 55 | TEST=dict( 56 | DETECTIONS_PER_IMAGE=500, 57 | 
EVAL_PEROID=5000, 58 | ), 59 | OUTPUT_DIR=osp.join( 60 | '/data/Outputs/model_logs/cvpods_playground', 61 | osp.split(osp.realpath(__file__))[0].split("playground/")[-1]), 62 | ) 63 | 64 | 65 | class CustomFCOSConfig(FCOSConfig): 66 | def __init__(self): 67 | super(CustomFCOSConfig, self).__init__() 68 | self._register_configuration(_config_dict) 69 | 70 | 71 | config = CustomFCOSConfig() 72 | -------------------------------------------------------------------------------- /playground/detection/crowdhuman/poto.res50.fpn.crowdhuman.800size.30k/net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from cvpods.layers import ShapeSpec 4 | from cvpods.modeling.anchor_generator import ShiftGenerator 5 | from cvpods.modeling.backbone import Backbone 6 | from cvpods.modeling.backbone.fpn import build_retinanet_resnet_fpn_p5_backbone 7 | 8 | from fcos import FCOS 9 | 10 | 11 | def build_backbone(cfg, input_shape=None): 12 | """ 13 | Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 14 | 15 | Returns: 16 | an instance of :class:`Backbone` 17 | """ 18 | if input_shape is None: 19 | input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) 20 | 21 | backbone = build_retinanet_resnet_fpn_p5_backbone(cfg, input_shape) 22 | assert isinstance(backbone, Backbone) 23 | return backbone 24 | 25 | 26 | def build_shift_generator(cfg, input_shape): 27 | 28 | return ShiftGenerator(cfg, input_shape) 29 | 30 | 31 | def build_model(cfg): 32 | 33 | cfg.build_backbone = build_backbone 34 | cfg.build_shift_generator = build_shift_generator 35 | 36 | model = FCOS(cfg) 37 | logger = logging.getLogger(__name__) 38 | logger.info("Model:\n{}".format(model)) 39 | return model 40 | --------------------------------------------------------------------------------
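The core of the ATSS assignment in atss.py above is the adaptive IoU threshold: for every ground-truth box, the top-k closest shift locations per FPN level are taken as candidates, and a candidate becomes a positive only if its IoU with that box exceeds the mean plus the standard deviation of the candidate IoUs (and its location falls inside the box). The toy snippet below isolates just that thresholding step with invented numbers; it is a didactic sketch, not code from the repository.

```python
# Didactic sketch of ATSS's adaptive IoU threshold (mean + std over candidates).
# The IoU values are invented; in atss.py they come from pairwise_iou between
# ground-truth boxes and stride-scaled anchors centered on the top-k shifts.
import torch

# candidate_ious: (num_gt, num_candidates) IoUs of the closest candidate shifts
candidate_ious = torch.tensor([
    [0.10, 0.35, 0.62, 0.55, 0.20],   # gt box 0
    [0.05, 0.15, 0.12, 0.40, 0.33],   # gt box 1
])

ious_thr = (candidate_ious.mean(dim=1, keepdim=True)
            + candidate_ious.std(dim=1, keepdim=True))
is_foreground = candidate_ious >= ious_thr   # per-object adaptive positive mask

print(ious_thr.squeeze(1))   # roughly tensor([0.586, 0.358]); threshold adapts per object
print(is_foreground)         # gt 0 keeps only its 0.62 candidate, gt 1 only its 0.40
```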
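The POTO configs in this excerpt only expose the assignment hyper-parameters (MODEL.POTO.ALPHA, CENTER_SAMPLING_RADIUS, and NMS disabled via NMS_THRESH_TEST=1.0 and NMS_TYPE=None); the fcos.py that implements the matcher is not included above. As a rough, hedged illustration of how an ALPHA-weighted one-to-one assignment can be set up, following the POTO paper's quality of cls^(1-alpha) * IoU^alpha with a Hungarian matcher rather than this repository's exact code, consider the sketch below.

```python
# Hedged sketch of a POTO-style one-to-one assignment. Shapes, toy inputs and
# the helper name are assumptions; only ALPHA=0.8 and the "positives must lie
# inside the gt box" prior (CENTER_SAMPLING_RADIUS=0.0) come from the configs.
import torch
from scipy.optimize import linear_sum_assignment

def poto_match(cls_prob, iou, inside_box, alpha=0.8):
    """cls_prob, iou, inside_box: (num_gt, num_shifts) tensors."""
    quality = cls_prob.pow(1 - alpha) * iou.pow(alpha)
    quality = quality * inside_box.float()               # spatial prior: inside the gt box
    gt_idx, shift_idx = linear_sum_assignment(-quality.numpy())  # maximize total quality
    return gt_idx, shift_idx                             # exactly one shift per gt box

cls_prob = torch.rand(2, 6)
iou = torch.rand(2, 6)
inside_box = torch.ones(2, 6, dtype=torch.bool)
print(poto_match(cls_prob, iou, inside_box))
```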