├── .assets ├── d-cube_logo.png └── teaser.png ├── .gitignore ├── LICENSE ├── README.md ├── d_cube ├── __init__.py ├── d3.py ├── data_util.py └── vis_util.py ├── doc.md ├── eval_sota ├── README.md ├── groundingdino.py ├── owl_vit.py └── sphinx.py ├── qa.md ├── requirements.txt ├── scripts ├── eval_and_analysis_json.py ├── eval_json_example.py └── get_d3_stat.py └── setup.py /.assets/d-cube_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/d-cube_logo.png -------------------------------------------------------------------------------- /.assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/teaser.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .vscode/* 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | 163 | # mac system 164 | *.DS_Store 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial 4.0 International 2 | 3 | Creative Commons Corporation ("Creative Commons") is not a law firm and 4 | does not provide legal services or legal advice. Distribution of 5 | Creative Commons public licenses does not create a lawyer-client or 6 | other relationship. Creative Commons makes its licenses and related 7 | information available on an "as-is" basis. Creative Commons gives no 8 | warranties regarding its licenses, any material licensed under their 9 | terms and conditions, or any related information. Creative Commons 10 | disclaims all liability for damages resulting from their use to the 11 | fullest extent possible. 12 | 13 | Using Creative Commons Public Licenses 14 | 15 | Creative Commons public licenses provide a standard set of terms and 16 | conditions that creators and other rights holders may use to share 17 | original works of authorship and other material subject to copyright and 18 | certain other rights specified in the public license below. The 19 | following considerations are for informational purposes only, are not 20 | exhaustive, and do not form part of our licenses. 21 | 22 | - Considerations for licensors: Our public licenses are intended for 23 | use by those authorized to give the public permission to use 24 | material in ways otherwise restricted by copyright and certain other 25 | rights. Our licenses are irrevocable. Licensors should read and 26 | understand the terms and conditions of the license they choose 27 | before applying it. Licensors should also secure all rights 28 | necessary before applying our licenses so that the public can reuse 29 | the material as expected. 
Licensors should clearly mark any material 30 | not subject to the license. This includes other CC-licensed 31 | material, or material used under an exception or limitation to 32 | copyright. More considerations for licensors : 33 | wiki.creativecommons.org/Considerations_for_licensors 34 | 35 | - Considerations for the public: By using one of our public licenses, 36 | a licensor grants the public permission to use the licensed material 37 | under specified terms and conditions. If the licensor's permission 38 | is not necessary for any reason–for example, because of any 39 | applicable exception or limitation to copyright–then that use is not 40 | regulated by the license. Our licenses grant only permissions under 41 | copyright and certain other rights that a licensor has authority to 42 | grant. Use of the licensed material may still be restricted for 43 | other reasons, including because others have copyright or other 44 | rights in the material. A licensor may make special requests, such 45 | as asking that all changes be marked or described. Although not 46 | required by our licenses, you are encouraged to respect those 47 | requests where reasonable. More considerations for the public : 48 | wiki.creativecommons.org/Considerations_for_licensees 49 | 50 | Creative Commons Attribution-NonCommercial 4.0 International Public 51 | License 52 | 53 | By exercising the Licensed Rights (defined below), You accept and agree 54 | to be bound by the terms and conditions of this Creative Commons 55 | Attribution-NonCommercial 4.0 International Public License ("Public 56 | License"). To the extent this Public License may be interpreted as a 57 | contract, You are granted the Licensed Rights in consideration of Your 58 | acceptance of these terms and conditions, and the Licensor grants You 59 | such rights in consideration of benefits the Licensor receives from 60 | making the Licensed Material available under these terms and conditions. 61 | 62 | - Section 1 – Definitions. 63 | 64 | - a. Adapted Material means material subject to Copyright and 65 | Similar Rights that is derived from or based upon the Licensed 66 | Material and in which the Licensed Material is translated, 67 | altered, arranged, transformed, or otherwise modified in a 68 | manner requiring permission under the Copyright and Similar 69 | Rights held by the Licensor. For purposes of this Public 70 | License, where the Licensed Material is a musical work, 71 | performance, or sound recording, Adapted Material is always 72 | produced where the Licensed Material is synched in timed 73 | relation with a moving image. 74 | - b. Adapter's License means the license You apply to Your 75 | Copyright and Similar Rights in Your contributions to Adapted 76 | Material in accordance with the terms and conditions of this 77 | Public License. 78 | - c. Copyright and Similar Rights means copyright and/or similar 79 | rights closely related to copyright including, without 80 | limitation, performance, broadcast, sound recording, and Sui 81 | Generis Database Rights, without regard to how the rights are 82 | labeled or categorized. For purposes of this Public License, the 83 | rights specified in Section 2(b)(1)-(2) are not Copyright and 84 | Similar Rights. 85 | - d. 
Effective Technological Measures means those measures that, 86 | in the absence of proper authority, may not be circumvented 87 | under laws fulfilling obligations under Article 11 of the WIPO 88 | Copyright Treaty adopted on December 20, 1996, and/or similar 89 | international agreements. 90 | - e. Exceptions and Limitations means fair use, fair dealing, 91 | and/or any other exception or limitation to Copyright and 92 | Similar Rights that applies to Your use of the Licensed 93 | Material. 94 | - f. Licensed Material means the artistic or literary work, 95 | database, or other material to which the Licensor applied this 96 | Public License. 97 | - g. Licensed Rights means the rights granted to You subject to 98 | the terms and conditions of this Public License, which are 99 | limited to all Copyright and Similar Rights that apply to Your 100 | use of the Licensed Material and that the Licensor has authority 101 | to license. 102 | - h. Licensor means the individual(s) or entity(ies) granting 103 | rights under this Public License. 104 | - i. NonCommercial means not primarily intended for or directed 105 | towards commercial advantage or monetary compensation. For 106 | purposes of this Public License, the exchange of the Licensed 107 | Material for other material subject to Copyright and Similar 108 | Rights by digital file-sharing or similar means is NonCommercial 109 | provided there is no payment of monetary compensation in 110 | connection with the exchange. 111 | - j. Share means to provide material to the public by any means or 112 | process that requires permission under the Licensed Rights, such 113 | as reproduction, public display, public performance, 114 | distribution, dissemination, communication, or importation, and 115 | to make material available to the public including in ways that 116 | members of the public may access the material from a place and 117 | at a time individually chosen by them. 118 | - k. Sui Generis Database Rights means rights other than copyright 119 | resulting from Directive 96/9/EC of the European Parliament and 120 | of the Council of 11 March 1996 on the legal protection of 121 | databases, as amended and/or succeeded, as well as other 122 | essentially equivalent rights anywhere in the world. 123 | - l. You means the individual or entity exercising the Licensed 124 | Rights under this Public License. Your has a corresponding 125 | meaning. 126 | 127 | - Section 2 – Scope. 128 | 129 | - a. License grant. 130 | - 1. Subject to the terms and conditions of this Public 131 | License, the Licensor hereby grants You a worldwide, 132 | royalty-free, non-sublicensable, non-exclusive, irrevocable 133 | license to exercise the Licensed Rights in the Licensed 134 | Material to: 135 | - A. reproduce and Share the Licensed Material, in whole 136 | or in part, for NonCommercial purposes only; and 137 | - B. produce, reproduce, and Share Adapted Material for 138 | NonCommercial purposes only. 139 | - 2. Exceptions and Limitations. For the avoidance of doubt, 140 | where Exceptions and Limitations apply to Your use, this 141 | Public License does not apply, and You do not need to comply 142 | with its terms and conditions. 143 | - 3. Term. The term of this Public License is specified in 144 | Section 6(a). 145 | - 4. Media and formats; technical modifications allowed. 
The 146 | Licensor authorizes You to exercise the Licensed Rights in 147 | all media and formats whether now known or hereafter 148 | created, and to make technical modifications necessary to do 149 | so. The Licensor waives and/or agrees not to assert any 150 | right or authority to forbid You from making technical 151 | modifications necessary to exercise the Licensed Rights, 152 | including technical modifications necessary to circumvent 153 | Effective Technological Measures. For purposes of this 154 | Public License, simply making modifications authorized by 155 | this Section 2(a)(4) never produces Adapted Material. 156 | - 5. Downstream recipients. 157 | - A. Offer from the Licensor – Licensed Material. Every 158 | recipient of the Licensed Material automatically 159 | receives an offer from the Licensor to exercise the 160 | Licensed Rights under the terms and conditions of this 161 | Public License. 162 | - B. No downstream restrictions. You may not offer or 163 | impose any additional or different terms or conditions 164 | on, or apply any Effective Technological Measures to, 165 | the Licensed Material if doing so restricts exercise of 166 | the Licensed Rights by any recipient of the Licensed 167 | Material. 168 | - 6. No endorsement. Nothing in this Public License 169 | constitutes or may be construed as permission to assert or 170 | imply that You are, or that Your use of the Licensed 171 | Material is, connected with, or sponsored, endorsed, or 172 | granted official status by, the Licensor or others 173 | designated to receive attribution as provided in Section 174 | 3(a)(1)(A)(i). 175 | - b. Other rights. 176 | - 1. Moral rights, such as the right of integrity, are not 177 | licensed under this Public License, nor are publicity, 178 | privacy, and/or other similar personality rights; however, 179 | to the extent possible, the Licensor waives and/or agrees 180 | not to assert any such rights held by the Licensor to the 181 | limited extent necessary to allow You to exercise the 182 | Licensed Rights, but not otherwise. 183 | - 2. Patent and trademark rights are not licensed under this 184 | Public License. 185 | - 3. To the extent possible, the Licensor waives any right to 186 | collect royalties from You for the exercise of the Licensed 187 | Rights, whether directly or through a collecting society 188 | under any voluntary or waivable statutory or compulsory 189 | licensing scheme. In all other cases the Licensor expressly 190 | reserves any right to collect such royalties, including when 191 | the Licensed Material is used other than for NonCommercial 192 | purposes. 193 | 194 | - Section 3 – License Conditions. 195 | 196 | Your exercise of the Licensed Rights is expressly made subject to 197 | the following conditions. 198 | 199 | - a. Attribution. 200 | - 1. If You Share the Licensed Material (including in modified 201 | form), You must: 202 | - A. retain the following if it is supplied by the 203 | Licensor with the Licensed Material: 204 | - i. identification of the creator(s) of the Licensed 205 | Material and any others designated to receive 206 | attribution, in any reasonable manner requested by 207 | the Licensor (including by pseudonym if designated); 208 | - ii. a copyright notice; 209 | - iii. a notice that refers to this Public License; 210 | - iv. a notice that refers to the disclaimer of 211 | warranties; 212 | - v. a URI or hyperlink to the Licensed Material to 213 | the extent reasonably practicable; 214 | - B. 
indicate if You modified the Licensed Material and 215 | retain an indication of any previous modifications; and 216 | - C. indicate the Licensed Material is licensed under this 217 | Public License, and include the text of, or the URI or 218 | hyperlink to, this Public License. 219 | - 2. You may satisfy the conditions in Section 3(a)(1) in any 220 | reasonable manner based on the medium, means, and context in 221 | which You Share the Licensed Material. For example, it may 222 | be reasonable to satisfy the conditions by providing a URI 223 | or hyperlink to a resource that includes the required 224 | information. 225 | - 3. If requested by the Licensor, You must remove any of the 226 | information required by Section 3(a)(1)(A) to the extent 227 | reasonably practicable. 228 | - 4. If You Share Adapted Material You produce, the Adapter's 229 | License You apply must not prevent recipients of the Adapted 230 | Material from complying with this Public License. 231 | 232 | - Section 4 – Sui Generis Database Rights. 233 | 234 | Where the Licensed Rights include Sui Generis Database Rights that 235 | apply to Your use of the Licensed Material: 236 | 237 | - a. for the avoidance of doubt, Section 2(a)(1) grants You the 238 | right to extract, reuse, reproduce, and Share all or a 239 | substantial portion of the contents of the database for 240 | NonCommercial purposes only; 241 | - b. if You include all or a substantial portion of the database 242 | contents in a database in which You have Sui Generis Database 243 | Rights, then the database in which You have Sui Generis Database 244 | Rights (but not its individual contents) is Adapted Material; 245 | and 246 | - c. You must comply with the conditions in Section 3(a) if You 247 | Share all or a substantial portion of the contents of the 248 | database. 249 | 250 | For the avoidance of doubt, this Section 4 supplements and does not 251 | replace Your obligations under this Public License where the 252 | Licensed Rights include other Copyright and Similar Rights. 253 | 254 | - Section 5 – Disclaimer of Warranties and Limitation of Liability. 255 | 256 | - a. Unless otherwise separately undertaken by the Licensor, to 257 | the extent possible, the Licensor offers the Licensed Material 258 | as-is and as-available, and makes no representations or 259 | warranties of any kind concerning the Licensed Material, whether 260 | express, implied, statutory, or other. This includes, without 261 | limitation, warranties of title, merchantability, fitness for a 262 | particular purpose, non-infringement, absence of latent or other 263 | defects, accuracy, or the presence or absence of errors, whether 264 | or not known or discoverable. Where disclaimers of warranties 265 | are not allowed in full or in part, this disclaimer may not 266 | apply to You. 267 | - b. To the extent possible, in no event will the Licensor be 268 | liable to You on any legal theory (including, without 269 | limitation, negligence) or otherwise for any direct, special, 270 | indirect, incidental, consequential, punitive, exemplary, or 271 | other losses, costs, expenses, or damages arising out of this 272 | Public License or use of the Licensed Material, even if the 273 | Licensor has been advised of the possibility of such losses, 274 | costs, expenses, or damages. Where a limitation of liability is 275 | not allowed in full or in part, this limitation may not apply to 276 | You. 277 | - c. 
The disclaimer of warranties and limitation of liability 278 | provided above shall be interpreted in a manner that, to the 279 | extent possible, most closely approximates an absolute 280 | disclaimer and waiver of all liability. 281 | 282 | - Section 6 – Term and Termination. 283 | 284 | - a. This Public License applies for the term of the Copyright and 285 | Similar Rights licensed here. However, if You fail to comply 286 | with this Public License, then Your rights under this Public 287 | License terminate automatically. 288 | - b. Where Your right to use the Licensed Material has terminated 289 | under Section 6(a), it reinstates: 290 | 291 | - 1. automatically as of the date the violation is cured, 292 | provided it is cured within 30 days of Your discovery of the 293 | violation; or 294 | - 2. upon express reinstatement by the Licensor. 295 | 296 | For the avoidance of doubt, this Section 6(b) does not affect 297 | any right the Licensor may have to seek remedies for Your 298 | violations of this Public License. 299 | 300 | - c. For the avoidance of doubt, the Licensor may also offer the 301 | Licensed Material under separate terms or conditions or stop 302 | distributing the Licensed Material at any time; however, doing 303 | so will not terminate this Public License. 304 | - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 305 | License. 306 | 307 | - Section 7 – Other Terms and Conditions. 308 | 309 | - a. The Licensor shall not be bound by any additional or 310 | different terms or conditions communicated by You unless 311 | expressly agreed. 312 | - b. Any arrangements, understandings, or agreements regarding the 313 | Licensed Material not stated herein are separate from and 314 | independent of the terms and conditions of this Public License. 315 | 316 | - Section 8 – Interpretation. 317 | 318 | - a. For the avoidance of doubt, this Public License does not, and 319 | shall not be interpreted to, reduce, limit, restrict, or impose 320 | conditions on any use of the Licensed Material that could 321 | lawfully be made without permission under this Public License. 322 | - b. To the extent possible, if any provision of this Public 323 | License is deemed unenforceable, it shall be automatically 324 | reformed to the minimum extent necessary to make it enforceable. 325 | If the provision cannot be reformed, it shall be severed from 326 | this Public License without affecting the enforceability of the 327 | remaining terms and conditions. 328 | - c. No term or condition of this Public License will be waived 329 | and no failure to comply consented to unless expressly agreed to 330 | by the Licensor. 331 | - d. Nothing in this Public License constitutes or may be 332 | interpreted as a limitation upon, or waiver of, any privileges 333 | and immunities that apply to the Licensor or You, including from 334 | the legal processes of any jurisdiction or authority. 335 | 336 | Creative Commons is not a party to its public licenses. Notwithstanding, 337 | Creative Commons may elect to apply one of its public licenses to 338 | material it publishes and in those instances will be considered the 339 | "Licensor." The text of the Creative Commons public licenses is 340 | dedicated to the public domain under the CC0 Public Domain Dedication. 
341 | Except for the limited purpose of indicating that material is shared 342 | under a Creative Commons public license or as otherwise permitted by the 343 | Creative Commons policies published at creativecommons.org/policies, 344 | Creative Commons does not authorize the use of the trademark "Creative 345 | Commons" or any other trademark or logo of Creative Commons without its 346 | prior written consent including, without limitation, in connection with 347 | any unauthorized modifications to any of its public licenses or any 348 | other arrangements, understandings, or agreements concerning use of 349 | licensed material. For the avoidance of doubt, this paragraph does not 350 | form part of the public licenses. 351 | 352 | Creative Commons may be contacted at creativecommons.org. 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
![Logo](.assets/d-cube_logo.png)

A detection/segmentation dataset with class names characterized by intricate and flexible expressions

This repo is the toolbox for $D^3$ (the Description Detection Dataset).

[Doc 📚](doc.md) | [Paper (DOD) 📄](https://arxiv.org/abs/2307.12813) | [Paper (GRES) 📄](https://arxiv.org/abs/2305.12452) | [Awesome-DOD 🕶️](https://github.com/Charles-Xie/awesome-described-object-detection)
***

Description Detection Dataset ($D^3$, /dikju:b/) is an attempt at creating a next-generation object detection dataset.
Unlike traditional detection datasets, the class names of the objects are no longer simple nouns or noun phrases, but complex and descriptive expressions, such as `a dog not being held by a leash`.
For each image in the dataset, every object that matches a description is annotated.
The dataset provides annotations such as bounding boxes and finely crafted instance masks.
We believe it will contribute to the computer vision and vision-language communities.

# News

- [02/14/2024] Evaluation results for several SOTA methods (SPHINX (the first MLLM evaluated!), G-DINO, UNINEXT, etc.) are released, together with a [leaderboard](https://github.com/shikras/d-cube/tree/main/eval_sota) for $D^3$. :fire::fire:

- [10/12/2023] We released an [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection) list to collect and track related works.

- [09/22/2023] Our DOD [paper](https://arxiv.org/abs/2307.12813) was accepted by NeurIPS 2023! :fire:

- [07/25/2023] This toolkit is available on PyPI now. You can install this repo with `pip install ddd-dataset`.

- [07/25/2023] The [paper preprint](https://arxiv.org/abs/2307.12813) introducing the DOD task and the $D^3$ dataset is available on arXiv. Check it out!

- [07/18/2023] We have released our Description Detection Dataset ($D^3$) and the first version of the $D^3$ toolbox. You can download it now for your project.

- [07/14/2023] Our GRES [paper](https://arxiv.org/abs/2305.12452) has been accepted by ICCV 2023.

# Contents

- [Dataset Highlight](#task-and-dataset-highlight)
- [Download](#download)
- [Installation](#installation)
- [Usage](#usage)

# Task and Dataset Highlight

The $D^3$ dataset is designed for the Described Object Detection (DOD) task. The image below shows the difference between Referring Expression Comprehension (REC), Object Detection/Open-Vocabulary Detection (OVD), and Described Object Detection (DOD). OVD detects objects based on a category name, and each category can have zero to multiple instances; REC grounds exactly one region based on a language description, whether the referred object actually exists in the image or not; DOD detects every instance that matches a flexible description, on each image in the dataset. Related works are tracked in the [awesome-DOD](https://github.com/Charles-Xie/awesome-described-object-detection) list.

![Dataset Highlight](.assets/teaser.png "Highlight of the task & dataset")

For more information on the characteristics of this dataset, please refer to our paper.

# Download

Currently we host the $D^3$ dataset on cloud drives. You can download the dataset from [Google Drive](https://drive.google.com/drive/folders/11kfY12NzKPwsliLEcIYki1yUqt7PbMEi?usp=sharing) or [Baidu Pan]().

After downloading `d3_images.zip` (images in the dataset), `d3_pkl.zip` (dataset information for this toolkit) and `d3_json.zip` (annotations for evaluation), please extract these 3 zip files to your custom `IMG_ROOT`, `PKL_PATH` and `JSON_ANNO_PATH` directories. These paths will be used when you perform inference or evaluation on this dataset.
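For orientation, one possible layout after extracting the three archives is sketched below. The directory names are purely illustrative (they are not mandated by the toolkit); any location works as long as the same paths are passed to the toolkit and the evaluation scripts.

```text
D3/                 # any root directory you like
├── d3_images/      # extracted from d3_images.zip -> IMG_ROOT
├── d3_pkl/         # extracted from d3_pkl.zip    -> PKL_PATH
└── d3_json/        # extracted from d3_json.zip   -> JSON_ANNO_PATH
```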
# Installation

## Prerequisites
This toolkit requires a few Python packages like `numpy` and `pycocotools`. Other packages like `matplotlib` and `opencv-python` may also be required if you want to use the visualization scripts.

There are multiple ways to install the $D^3$ toolbox, as listed below:

## Install with pip
```bash
pip install ddd-dataset
```

## Install from source
```bash
git clone https://github.com/shikras/d-cube.git
# option 1: install it as a Python package
cd d-cube
python -m pip install .
# done

# option 2: just put the d-cube/d_cube directory in the root directory of your local repository
```

# Usage
Please refer to the [documentation 📚](doc.md) for more details.
Our toolbox is similar to [cocoapi](https://github.com/cocodataset/cocoapi) in style.

Here is a quick example of how to use $D^3$.
```python
from d_cube import D3
d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
all_img_ids = d3.get_img_ids()  # get the image ids in the dataset
all_img_info = d3.load_imgs(all_img_ids)  # load image info by passing a list of image ids
img_path = all_img_info[0]["file_name"]  # the image file name (relative to IMG_ROOT), so you can load it and run inference
```

Some frequently asked questions are answered in [this Q&A file](./qa.md).

# Citation

If you use our $D^3$ dataset, this toolbox, or otherwise find our work valuable, please cite [our paper](https://arxiv.org/abs/2307.12813):

```bibtex
@inproceedings{xie2023DOD,
  title={Described Object Detection: Liberating Object Detection with Flexible Expressions},
  author={Xie, Chi and Zhang, Zhao and Wu, Yixuan and Zhu, Feng and Zhao, Rui and Liang, Shuang},
  booktitle={Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS)},
  year={2023}
}

@inproceedings{wu2023gres,
  title={Advancing Referring Expression Segmentation Beyond Single Image},
  author={Wu, Yixuan and Zhang, Zhao and Xie, Chi and Zhu, Feng and Zhao, Rui},
  booktitle={International Conference on Computer Vision (ICCV)},
  year={2023}
}
```

More works related to Described Object Detection are tracked in this list: [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection).
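# Extended Usage Example

Beyond the quick example above, a typical DOD workflow iterates over description groups, runs a detector on every image of a group with all of the group's descriptions, and collects per-description predictions for evaluation. The sketch below is a minimal, non-authoritative illustration of such a loop: it only uses accessors and field names that appear in `d_cube/d3.py` (`inner_sent_id`, `raw_sent`, `file_name`, `bbox`, ...), while `run_my_detector` and the path constants are hypothetical placeholders you supply.

```python
import os.path as osp

from d_cube import D3

IMG_ROOT = "path/to/d3_images"    # images extracted from d3_images.zip (placeholder path)
PKL_ANNO_PATH = "path/to/d3_pkl"  # pkl annotations extracted from d3_pkl.zip (placeholder path)

d3 = D3(IMG_ROOT, PKL_ANNO_PATH)

for group in d3.load_groups(d3.get_group_ids()):
    # all descriptions (sentences) belonging to this group
    sent_infos = d3.load_sents(group["inner_sent_id"])
    descriptions = [s["raw_sent"] for s in sent_infos]

    for img_info in d3.load_imgs(group["img_id"]):
        img_path = osp.join(IMG_ROOT, img_info["file_name"])
        # preds = run_my_detector(img_path, descriptions)  # your model: boxes per description

        # ground-truth boxes for the first description on this image (for reference)
        anno_ids = d3.get_anno_ids(img_ids=img_info["id"], sent_ids=sent_infos[0]["id"])
        gt_boxes = [anno["bbox"][0].tolist() for anno in d3.load_annos(anno_ids)]
```

Predictions are typically dumped to a COCO-style JSON and scored with the scripts under `scripts/` (e.g. `eval_and_analysis_json.py`); see the [documentation](doc.md) and `eval_sota/` for concrete examples.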
139 | -------------------------------------------------------------------------------- /d_cube/__init__.py: -------------------------------------------------------------------------------- 1 | from .d3 import D3 2 | -------------------------------------------------------------------------------- /d_cube/d3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this is the core of the d-cube toolkit 5 | import os 6 | import os.path as osp 7 | import json 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | from pycocotools import mask 12 | import cv2 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | from .data_util import * 17 | 18 | 19 | class D3: 20 | def __init__(self, img_root, anno_root): 21 | self.image_dir = img_root 22 | self.anno_dir = anno_root 23 | self.load_data() 24 | 25 | def load_data(self): 26 | file_names = ["sentences.pkl", "annotations.pkl", "images.pkl", "groups.pkl"] 27 | self.data = { 28 | name.split(".")[0]: load_pkl(osp.join(self.anno_dir, name)) 29 | for name in file_names 30 | } 31 | 32 | def get_sent_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 33 | """get sentence ids for D-cube. 34 | 35 | Args: 36 | anno_ids (list, optional): annotation ids to get sentence ids. Defaults to []. 37 | img_ids (list, optional): image ids to get sentence ids. Defaults to []. 38 | group_ids (list, optional): group ids to get sentence ids. Defaults to []. 39 | sent_ids (list, optional): additional sentence ids you want to include. Defaults to []. 40 | 41 | Raises: 42 | Exception: anno_ids, img_ids and group_ids cannot be used together. 43 | 44 | Returns: 45 | list: sentence ids. 46 | """ 47 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 48 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 49 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 50 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 51 | 52 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 53 | return list(self.data["sentences"].keys()) 54 | 55 | if ( 56 | (anno_ids and img_ids) 57 | or (anno_ids and group_ids) 58 | or (img_ids and group_ids) 59 | ): 60 | raise Exception("anno_ids, img_ids, group_ids can only be used alone") 61 | 62 | out_ids_set = set() 63 | if img_ids: 64 | for img_id in img_ids: 65 | imganno_ids = self.data["images"][img_id]["anno_id"] 66 | for ianno_id in imganno_ids: 67 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"]) 68 | 69 | if group_ids: 70 | for group_id in group_ids: 71 | out_ids_set |= set(self.data["groups"][group_id]["inner_sent_id"]) 72 | 73 | if anno_ids: 74 | for ianno_id in anno_ids: 75 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"]) 76 | 77 | if sent_ids: 78 | out_ids_set &= set(sent_ids) 79 | 80 | return list(out_ids_set) 81 | 82 | def get_anno_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 83 | """get annotation ids for D-cube. 84 | 85 | Args: 86 | anno_ids (list, optional): additional annotation ids you want to include. Defaults to []. 87 | img_ids (list, optional): image ids to get annotation ids. Defaults to []. 88 | group_ids (list, optional): group ids to get annotation ids. Defaults to []. 89 | sent_ids (list, optional): sentence ids to get annotation ids. Defaults to []. 90 | 91 | Raises: 92 | Exception: img_ids and group_ids cannot be used together. 
93 | 94 | Returns: 95 | list: annotation ids. 96 | """ 97 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 98 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 99 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 100 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 101 | 102 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 103 | return list(self.data["annotations"].keys()) 104 | 105 | if img_ids and group_ids: 106 | raise Exception("img_ids, group_ids can only be used alone") 107 | 108 | out_ids_set = set() 109 | if img_ids: 110 | for img_id in img_ids: 111 | out_ids_set |= set(self.data["images"][img_id]["anno_id"]) 112 | 113 | if group_ids: 114 | for group_id in group_ids: 115 | for groupimg_id in self.data["groups"][group_id]["img_id"]: 116 | out_ids_set |= set(self.data["images"][groupimg_id]["anno_id"]) 117 | 118 | if sent_ids and img_ids: 119 | for sent_id in sent_ids: 120 | out_ids_set &= set(self.data["sentences"][sent_id]["anno_id"]) 121 | else: 122 | for sent_id in sent_ids: 123 | out_ids_set |= set(self.data["sentences"][sent_id]["anno_id"]) 124 | 125 | if anno_ids: 126 | out_ids_set &= set(anno_ids) 127 | 128 | return list(out_ids_set) 129 | 130 | def get_img_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 131 | """get image ids for D-cube. 132 | 133 | Args: 134 | anno_ids (list, optional): annotation ids to get image ids. Defaults to []. 135 | img_ids (list, optional): additional image ids you want to include. Defaults to []. 136 | group_ids (list, optional): group ids to get image ids. Defaults to []. 137 | sent_ids (list, optional): sentence ids to get image ids. Defaults to []. 138 | 139 | Raises: 140 | Exception: anno_ids and img_ids cannot be used together. 141 | Exception: anno_ids and group_ids cannot be used together. 142 | 143 | Returns: 144 | list: image ids. 145 | """ 146 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 147 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 148 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 149 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 150 | 151 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 152 | return list(self.data["images"].keys()) 153 | 154 | if anno_ids and img_ids: 155 | raise Exception("anno_ids and img_ids can only be used alone") 156 | if anno_ids and group_ids: 157 | raise Exception("anno_ids and group_ids can only be used alone") 158 | 159 | out_ids_set = set() 160 | if anno_ids: 161 | for ianno_id in anno_ids: 162 | out_ids_set.add(self.data["annotations"][ianno_id]["image_id"]) 163 | 164 | if group_ids: 165 | for group_id in group_ids: 166 | out_ids_set |= set(self.data["groups"][group_id]["img_id"]) 167 | 168 | if sent_ids: 169 | for sent_id in sent_ids: 170 | for sentanno_id in self.data["sentences"][sent_id]["anno_id"]: 171 | out_ids_set.add(self.data["annotations"][sentanno_id]["image_id"]) 172 | 173 | if img_ids: 174 | out_ids_set &= set(img_ids) 175 | 176 | return list(out_ids_set) 177 | 178 | def get_group_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 179 | """get group ids for D-cube. 180 | 181 | Args: 182 | anno_ids (list, optional): annotation ids to get group ids. Defaults to []. 183 | img_ids (list, optional): image ids to get group ids. Defaults to []. 184 | group_ids (list, optional): additional group_ids you want to include. Defaults to []. 
185 | sent_ids (list, optional): sentence ids to get group ids. Defaults to []. 186 | 187 | Raises: 188 | Exception: anno_ids, img_ids and sent_ids cannot be used together. 189 | 190 | Returns: 191 | list: group ids. 192 | """ 193 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 194 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 195 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 196 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 197 | 198 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 199 | return list(self.data["groups"].keys()) 200 | 201 | if anno_ids and img_ids: 202 | raise Exception("anno_ids and img_ids can only be used alone") 203 | if anno_ids and sent_ids: 204 | raise Exception("anno_ids and sent_ids can only be used alone") 205 | if img_ids and sent_ids: 206 | raise Exception("img_ids and sent_ids can only be used alone") 207 | 208 | out_ids_set = set() 209 | if img_ids: 210 | for img_id in img_ids: 211 | out_ids_set.add(self.data["images"][img_id]["group_id"]) 212 | 213 | if anno_ids: 214 | for anno_id in anno_ids: 215 | out_ids_set.add(self.data["annotations"][anno_id]["group_id"]) 216 | 217 | if sent_ids: 218 | for sent_id in sent_ids: 219 | out_ids_set |= set(self.data["sentences"][sent_id]["group_id"]) 220 | 221 | if group_ids: 222 | out_ids_set &= set(group_ids) 223 | 224 | return list(out_ids_set) 225 | 226 | def load_sents(self, sent_ids=None): 227 | """load sentence info. 228 | 229 | Args: 230 | sent_ids (list, int, optional): sentence ids. Defaults to None. 231 | 232 | Returns: 233 | list: a list of sentence info. 234 | """ 235 | if sent_ids is not None and not isinstance(sent_ids, list): 236 | sent_ids = [sent_ids] 237 | if isinstance(sent_ids, list): 238 | return [self.data["sentences"][sent_id] for sent_id in sent_ids] 239 | else: 240 | return list(self.data["sentences"].values()) 241 | 242 | def load_annos(self, anno_ids=None): 243 | """load annotation info. 244 | 245 | Args: 246 | anno_ids (list, int, optional): annotation ids. Defaults to None. 247 | 248 | Returns: 249 | list: a list of annotation info. 250 | """ 251 | if anno_ids is not None and not isinstance(anno_ids, list): 252 | anno_ids = [anno_ids] 253 | if isinstance(anno_ids, list): 254 | return [self.data["annotations"][anno_id] for anno_id in anno_ids] 255 | else: 256 | return list(self.data["annotations"].values()) 257 | 258 | def load_imgs(self, img_ids=None): 259 | """load image info. 260 | 261 | Args: 262 | img_ids (list, int, optional): image ids. Defaults to None. 263 | 264 | Returns: 265 | list: a list of image info. 266 | """ 267 | if img_ids is not None and not isinstance(img_ids, list): 268 | img_ids = [img_ids] 269 | if isinstance(img_ids, list): 270 | return [self.data["images"][img_ids] for img_ids in img_ids] 271 | else: 272 | return list(self.data["images"].values()) 273 | 274 | def load_groups(self, group_ids=None): 275 | """load group info. 276 | 277 | Args: 278 | group_ids (list, int, optional): group ids. Defaults to None. 279 | 280 | Returns: 281 | list: a list of group info. 
282 | """ 283 | if group_ids is not None and not isinstance(group_ids, list): 284 | group_ids = [group_ids] 285 | if isinstance(group_ids, list): 286 | return [self.data["groups"][group_ids] for group_ids in group_ids] 287 | else: 288 | return list(self.data["groups"].values()) 289 | 290 | def get_mask(self, anno): 291 | rle = anno[0]["segmentation"] 292 | m = mask.decode(rle) 293 | m = np.sum( 294 | m, axis=2 295 | ) # sometimes there are multiple binary map (corresponding to multiple segs) 296 | m = m.astype(np.uint8) # convert to np.uint8 297 | # compute area 298 | area = sum(mask.area(rle)) # should be close to ann['area'] 299 | return {"mask": m, "area": area} 300 | 301 | def show_mask(self, anno): 302 | M = self.get_mask(anno) 303 | msk = M["mask"] 304 | ax = plt.gca() 305 | ax.imshow(msk) 306 | 307 | def show_image_seg( 308 | self, 309 | img_ids=[], 310 | save_dir=None, 311 | show_sent=False, 312 | on_image=False, 313 | checkerboard_bg=False, 314 | is_instance=True, 315 | ): 316 | if is_instance and checkerboard_bg: 317 | raise ValueError( 318 | "Cannot apply both is_instance and checkboard_bg at the same time." 319 | ) 320 | img_infos = self.load_imgs(img_ids=img_ids) 321 | for img_idx, img_info in enumerate(img_infos): 322 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 323 | anno_infos = self.load_annos(img_info["anno_id"]) 324 | 325 | bm_canvas = defaultdict(list) 326 | merge_canvas = defaultdict(list) 327 | for anno_info in anno_infos: 328 | for sent_id in anno_info["sent_id"]: 329 | bm_canvas[sent_id].append(anno_info["segmentation"]) 330 | 331 | for sent_id, bm_list in bm_canvas.items(): 332 | merge_canvas[sent_id] = merge_rle( 333 | bm_list, is_instance=is_instance, on_image=on_image 334 | ) 335 | 336 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 337 | for sent_id, merge_mask in merge_canvas.items(): 338 | if checkerboard_bg: 339 | merge_mask = add_checkerboard_bg(img, merge_mask) 340 | elif on_image: 341 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True) 342 | if show_sent: 343 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 344 | merge_mask = paste_text(merge_mask, sent_en) 345 | cv2.imwrite( 346 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask 347 | ) 348 | 349 | return merge_canvas 350 | 351 | def show_group_seg( 352 | self, 353 | group_ids, 354 | save_root, 355 | show_sent=True, 356 | is_instance=True, 357 | on_image=False, 358 | checkerboard_bg=False, 359 | ): 360 | group_infos = self.load_groups(group_ids=group_ids) 361 | for group_info in group_infos: 362 | save_dir = osp.join(save_root, group_info["group_name"]) 363 | os.makedirs(save_dir, exist_ok=True) 364 | self.show_image_seg( 365 | img_ids=group_info["img_id"], 366 | save_dir=save_dir, 367 | show_sent=show_sent, 368 | is_instance=is_instance, 369 | on_image=on_image, 370 | checkerboard_bg=checkerboard_bg, 371 | ) 372 | 373 | def show_image_seg_bbox( 374 | self, 375 | img_ids=[], 376 | save_dir=None, 377 | show_sent=False, 378 | on_image=False, 379 | checkerboard_bg=False, 380 | is_instance=True, 381 | ): 382 | if is_instance and checkerboard_bg: 383 | raise ValueError( 384 | "Cannot apply both is_instance and checkboard_bg at the same time." 
385 | ) 386 | img_infos = self.load_imgs(img_ids=img_ids) 387 | for img_idx, img_info in enumerate(img_infos): 388 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 389 | anno_infos = self.load_annos(img_info["anno_id"]) 390 | 391 | bm_canvas = defaultdict(list) 392 | merge_canvas = defaultdict(list) 393 | sent_boxes = defaultdict(list) 394 | for anno_info in anno_infos: 395 | for sent_id in anno_info["sent_id"]: 396 | bm_canvas[sent_id].append(anno_info["segmentation"]) 397 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist()) 398 | 399 | for sent_id, bm_list in bm_canvas.items(): 400 | merge_canvas[sent_id] = merge_rle( 401 | bm_list, is_instance=is_instance, on_image=on_image 402 | ) 403 | 404 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 405 | for sent_id, merge_mask in merge_canvas.items(): 406 | # vis mask 407 | if checkerboard_bg: 408 | merge_mask = add_checkerboard_bg(img, merge_mask) 409 | elif on_image: 410 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True) 411 | # vis box 412 | bboxes = sent_boxes[sent_id] 413 | merge_mask = visualize_bbox_on_image(merge_mask, bboxes) 414 | # vis sent 415 | if show_sent: 416 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 417 | merge_mask = paste_text(merge_mask, sent_en) 418 | cv2.imwrite( 419 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask 420 | ) 421 | 422 | return merge_canvas 423 | 424 | def show_group_seg_bbox( 425 | self, 426 | group_ids, 427 | save_root, 428 | show_sent=True, 429 | is_instance=True, 430 | on_image=False, 431 | checkerboard_bg=False, 432 | ): 433 | group_infos = self.load_groups(group_ids=group_ids) 434 | for group_info in group_infos: 435 | save_dir = osp.join(save_root, group_info["group_name"]) 436 | os.makedirs(save_dir, exist_ok=True) 437 | self.show_image_seg_bbox( 438 | img_ids=group_info["img_id"], 439 | save_dir=save_dir, 440 | show_sent=show_sent, 441 | is_instance=is_instance, 442 | on_image=on_image, 443 | checkerboard_bg=checkerboard_bg, 444 | ) 445 | 446 | def show_image_bbox(self, img_ids=[], save_dir=None, show_sent=False): 447 | img_infos = self.load_imgs(img_ids=img_ids) 448 | for img_idx, img_info in enumerate(img_infos): 449 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 450 | anno_infos = self.load_annos(img_info["anno_id"]) 451 | 452 | sent_boxes = defaultdict(list) 453 | for anno_info in anno_infos: 454 | for sent_id in anno_info["sent_id"]: 455 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist()) 456 | 457 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 458 | for sent_id, bboxes in sent_boxes.items(): 459 | merge_img = visualize_bbox_on_image(img, bboxes) 460 | if show_sent: 461 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 462 | merge_img = paste_text(merge_img, sent_en) 463 | cv2.imwrite( 464 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_img 465 | ) 466 | 467 | def show_group_bbox(self, group_ids, save_root, show_sent=True): 468 | group_infos = self.load_groups(group_ids=group_ids) 469 | for group_info in group_infos: 470 | save_dir = osp.join(save_root, group_info["group_name"]) 471 | os.makedirs(save_dir, exist_ok=True) 472 | self.show_image_bbox( 473 | img_ids=group_info["img_id"], save_dir=save_dir, show_sent=show_sent 474 | ) 475 | 476 | def stat_description(self, with_rev=False, inter_group=False): 477 | """calculate and print dataset statistics. 
478 | 479 | Args: 480 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False. 481 | inter_group (bool, optional): calculate under intra- or inter-group settings. Defaults to False. 482 | """ 483 | stat_dict = {} 484 | # Number of sents 485 | sent_ids = list(self.data["sentences"].keys()) 486 | if not with_rev: 487 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 488 | stat_dict["nsent"] = len(sent_ids) 489 | # Number of annos / instance # TODO: rm rev 490 | stat_dict["nanno"] = len(self.data["annotations"].keys()) 491 | # Number of images 492 | stat_dict["nimg"] = len(self.data["images"].keys()) 493 | # Number of groups 494 | stat_dict["ngroup"] = len(self.data["groups"].keys()) 495 | 496 | # Number of img-sent pair 497 | num_img_sent = 0 498 | for img_id in self.data["images"].keys(): 499 | anno_ids = self.get_anno_ids(img_ids=img_id) 500 | anno_infos = self.load_annos(anno_ids=anno_ids) 501 | cur_sent_set = set() 502 | group_sent_ids = set( 503 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"] 504 | ) 505 | for anno_info in anno_infos: 506 | cur_sent_set |= set( 507 | [i for i in anno_info["sent_id"] if i in group_sent_ids] 508 | ) 509 | if not with_rev: 510 | cur_sent_set = [ 511 | sent_id for sent_id in cur_sent_set if not self.is_revsent(sent_id) 512 | ] 513 | num_img_sent += len(cur_sent_set) 514 | stat_dict["num_img_sent"] = num_img_sent 515 | 516 | # Number of absence img-sent pair 517 | num_anti_img_sent = 0 518 | for img_id in self.data["images"].keys(): 519 | anno_ids = self.get_anno_ids(img_ids=img_id) 520 | anno_infos = self.load_annos(anno_ids=anno_ids) 521 | cur_sent_set = set() 522 | group_sent_ids = set( 523 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"] 524 | ) 525 | for anno_info in anno_infos: 526 | cur_sent_set |= set( 527 | [i for i in anno_info["sent_id"] if i in group_sent_ids] 528 | ) 529 | assert group_sent_ids.issuperset( 530 | cur_sent_set 531 | ), f"{group_sent_ids}, {cur_sent_set}" 532 | cur_anti_sent_set = group_sent_ids - cur_sent_set 533 | if not with_rev: 534 | cur_anti_sent_set = [ 535 | sent_id 536 | for sent_id in cur_anti_sent_set 537 | if not self.is_revsent(sent_id) 538 | ] 539 | num_anti_img_sent += len(cur_anti_sent_set) 540 | stat_dict["num_anti_img_sent"] = num_anti_img_sent 541 | 542 | # Number of anno-sent pair 543 | num_anno_sent = 0 544 | anno_infos = self.load_annos() 545 | for anno_info in anno_infos: 546 | if inter_group: 547 | anno_sent_ids = [i for i in anno_info["sent_id"]] 548 | else: 549 | group_sent_ids = set( 550 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"] 551 | ) 552 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in group_sent_ids] 553 | if not with_rev: 554 | anno_sent_ids = [ 555 | sent_id for sent_id in anno_sent_ids if not self.is_revsent(sent_id) 556 | ] 557 | num_anno_sent += len(anno_sent_ids) 558 | 559 | stat_dict["num_anno_sent"] = num_anno_sent 560 | 561 | # Number of anti anno-sent pair 562 | num_anti_anno_sent = 0 563 | anno_infos = self.load_annos() 564 | for anno_info in anno_infos: 565 | if inter_group: 566 | all_sent_ids = set(self.get_sent_ids()) 567 | anno_sent_ids = anno_info["sent_id"] 568 | 569 | anti_sent_ids = [ 570 | sent_id for sent_id in all_sent_ids if sent_id not in anno_sent_ids 571 | ] 572 | else: 573 | group_sent_ids = set( 574 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"] 575 | ) 576 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in 
group_sent_ids] 577 | 578 | anti_sent_ids = [ 579 | sent_id 580 | for sent_id in group_sent_ids 581 | if sent_id not in anno_sent_ids 582 | ] 583 | 584 | if not with_rev: 585 | anti_sent_ids = [ 586 | sent_id for sent_id in anti_sent_ids if not self.is_revsent(sent_id) 587 | ] 588 | num_anti_anno_sent += len(anti_sent_ids) 589 | 590 | stat_dict["num_anti_anno_sent"] = num_anti_anno_sent 591 | 592 | # Len of sentence 593 | totle_len = 0 594 | for sent_info in self.load_sents(sent_ids): 595 | totle_len += len(sent_info["raw_sent"].split()) 596 | 597 | stat_dict["avg_sent_len"] = totle_len / stat_dict["nsent"] 598 | 599 | print(stat_dict) 600 | 601 | def is_revsent(self, sent_id): 602 | sent_info = self.load_sents(sent_ids=sent_id) 603 | return sent_info[0]["is_negative"] 604 | 605 | def data2coca(self, out_root, with_rev=False): 606 | group_infos = self.load_groups() 607 | for group_info in group_infos: 608 | sent_ids = group_info["inner_sent_id"] 609 | if not with_rev: 610 | sent_ids = [ 611 | sent_id for sent_id in sent_ids if not self.is_revsent(sent_id) 612 | ] 613 | sent_infos = self.load_sents(sent_ids) 614 | for sent_info in sent_infos: 615 | sent = sent_info["raw_sent"] 616 | img_infos = self.load_imgs(group_info["img_id"]) 617 | for img_info in img_infos: 618 | src_img_path = osp.join(self.image_dir, img_info["file_name"]) 619 | raw_name = img_info["file_name"].split("/")[-1] 620 | out_img_dir = osp.join(out_root, "images", sent) 621 | os.makedirs(out_img_dir, exist_ok=True) 622 | out_img_path = osp.join(out_img_dir, raw_name) 623 | copy_file(src_img_path, out_img_path) 624 | 625 | out_mask_dir = osp.join(out_root, "masks", sent) 626 | os.makedirs(out_mask_dir, exist_ok=True) 627 | out_mask_path = osp.join( 628 | out_mask_dir, raw_name.replace(".jpg", ".png") 629 | ) 630 | 631 | cur_anno_ids = self.get_anno_ids( 632 | img_ids=img_info["id"], sent_ids=sent_info["id"] 633 | ) 634 | anno_infos = self.load_annos(cur_anno_ids) 635 | rle_list = [anno_info["segmentation"] for anno_info in anno_infos] 636 | bmask = merge2bin(rle_list, img_info["height"], img_info["width"]) 637 | cv2.imwrite(out_mask_path, bmask) 638 | 639 | def convert2coco(self, out_root, anti_mode=False, is_group_separated=True): 640 | """ 641 | Convert the annotation format of D^3 dataset to COCO. 642 | 1. The sent_id can be viewed as category_id in COCO. 643 | 2. If `is_group_separated` is True, `outer_sent_id` does not need to be considered. 644 | 3. if `with_rev` is False, sents that meet `is_revsent` will be ignore. 
645 | """ 646 | os.makedirs(out_root, exist_ok=True) 647 | coco_dict = { 648 | "images": [], 649 | "categories": [], 650 | "annotations": [], 651 | } 652 | 653 | sent_ids = self.get_sent_ids() 654 | if anti_mode == 1: 655 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 656 | elif anti_mode == 2: 657 | sent_ids = [sent_id for sent_id in sent_ids if self.is_revsent(sent_id)] 658 | elif anti_mode == 0: 659 | pass 660 | else: 661 | raise Exception("Unimplemented anti_mode.") 662 | 663 | sent_infos = self.load_sents(sent_ids) 664 | for isent_info in sent_infos: 665 | coco_dict["categories"].append( 666 | { 667 | "id": isent_info["id"], 668 | "name": isent_info["raw_sent"], 669 | } 670 | ) 671 | 672 | item_id = 0 673 | img_infos = self.load_imgs() 674 | for iimg_info in img_infos: 675 | coco_dict["images"].append( 676 | { 677 | "id": iimg_info["id"], 678 | "file_name": iimg_info["file_name"], 679 | "height": iimg_info["height"], 680 | "width": iimg_info["width"], 681 | } 682 | ) 683 | 684 | anno_ids = self.get_anno_ids(img_ids=iimg_info["id"]) 685 | anno_infos = self.load_annos(anno_ids) 686 | 687 | for ianno_info in anno_infos: 688 | if is_group_separated: 689 | inner_group_sent_ids = [ 690 | isent_id 691 | for isent_id in ianno_info["sent_id"] 692 | if isent_id 693 | in self.load_groups(ianno_info["group_id"])[0]["inner_sent_id"] 694 | ] 695 | cur_sent_ids = inner_group_sent_ids 696 | else: 697 | cur_sent_ids = ianno_info["sent_id"] 698 | 699 | for isent_id in cur_sent_ids: 700 | if isent_id not in sent_ids: 701 | continue 702 | 703 | seg = ianno_info["segmentation"][0].copy() 704 | if isinstance(seg, dict): # RLE 705 | counts = seg["counts"] 706 | if not isinstance(counts, str): 707 | # make it json-serializable 708 | seg["counts"] = counts.decode("ascii") 709 | 710 | coco_dict["annotations"].append( 711 | { 712 | "id": item_id, 713 | "image_id": iimg_info["id"], 714 | "category_id": isent_id, 715 | "segmentation": seg, 716 | "area": int(ianno_info["area"][0]), 717 | "bbox": [ 718 | int(cord) for cord in ianno_info["bbox"][0].tolist() 719 | ], 720 | "iscrowd": 0, # TODO: ianno_info["iscrowd"] 721 | } 722 | ) 723 | item_id += 1 724 | 725 | with open(osp.join(out_root, "coco_annotations.json"), "w") as f: 726 | json.dump(coco_dict, f, indent=4) 727 | 728 | def sent_analyse(self, save_dir, with_rev=False): 729 | """analyze word info in D-cube and generate word length histograms, word clouds, etc. 730 | 731 | Args: 732 | save_dir (str): path to save the visualized results. 733 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False. 
734 | """ 735 | sent_ids = self.get_sent_ids() 736 | if not with_rev: 737 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 738 | 739 | sent_lens, sent_raws = [], [] 740 | sent_infos = self.load_sents(sent_ids) 741 | for isent_info in sent_infos: 742 | sent_raws.append(isent_info["raw_sent"]) 743 | sent_lens.append(len(isent_info["raw_sent"].split())) 744 | 745 | os.makedirs(save_dir, exist_ok=True) 746 | # plot_hist( 747 | # sent_lens, 748 | # bins=max(sent_lens) - min(sent_lens) + 1, 749 | # save_path=osp.join(save_dir, "words_hist.pdf"), 750 | # x="Lengths of descriptions", 751 | # ) 752 | # generate_wordclouds(sent_raws, osp.join(save_dir, "word_clouds")) 753 | 754 | def group_analysis(self, save_dir, with_rev=False): 755 | group_infos = self.load_groups() 756 | scene_tree = defaultdict(dict) 757 | 758 | for group_info in group_infos: 759 | scene_tree[group_info["scene"]][group_info["group_name"]] = {"nimg": 0.1} 760 | 761 | # vis_group_tree(scene_tree, osp.join(save_dir, 'scene_tree.png')) # the visualized result is ugly 762 | 763 | def bbox_num_analyze(self): 764 | n_cat = len(self.data["sentences"].keys()) 765 | all_img_ids = self.data["images"].keys() 766 | n_img = len(all_img_ids) 767 | cat_obj_count = np.zeros((n_cat, n_img), dtype=int) 768 | for img_id in all_img_ids: 769 | # img_cat_ids = self.get_sent_ids(img_ids=img_id) 770 | anno_ids = self.get_anno_ids(img_ids=img_id) 771 | anno_infos = self.load_annos(anno_ids=anno_ids) 772 | for anno in anno_infos: 773 | for sid in anno["sent_id"]: 774 | cat_obj_count[sid - 1, img_id] += 1 775 | return cat_obj_count 776 | -------------------------------------------------------------------------------- /d_cube/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # data utility functions are defined in the script 5 | import json 6 | import pickle 7 | import shutil 8 | 9 | # from io import StringIO 10 | # import string 11 | 12 | import numpy as np 13 | import cv2 14 | from pycocotools import mask as cocomask 15 | 16 | VOC_COLORMAP = [ 17 | [128, 0, 0], 18 | [0, 128, 0], 19 | [128, 128, 0], 20 | [0, 0, 128], 21 | [128, 0, 128], 22 | [0, 128, 128], 23 | [128, 128, 128], 24 | [64, 0, 0], 25 | [192, 0, 0], 26 | [64, 128, 0], 27 | [192, 128, 0], 28 | [64, 0, 128], 29 | [192, 0, 128], 30 | [64, 128, 128], 31 | [192, 128, 128], 32 | [0, 64, 0], 33 | [128, 64, 0], 34 | [0, 192, 0], 35 | [128, 192, 0], 36 | [0, 64, 128], 37 | ] 38 | 39 | 40 | def visualize_bbox_on_image(img, bbox_list, save_path=None, thickness=3): 41 | img_copy = img.copy() 42 | for i, bbox in enumerate(bbox_list): 43 | color = tuple(VOC_COLORMAP[i % len(VOC_COLORMAP)]) 44 | x, y, w, h = bbox 45 | img_copy = cv2.rectangle( 46 | img_copy, (int(x), int(y)), (int((x + w)), int(y + h)), color, thickness 47 | ) 48 | if save_path: 49 | cv2.imwrite(save_path, img_copy) 50 | return img_copy 51 | 52 | 53 | def rle2bmask(rle): 54 | bm = cocomask.decode(rle) 55 | if len(bm.shape) == 3: 56 | bm = np.sum( 57 | bm, axis=2 58 | ) # sometimes there are multiple binary map (corresponding to multiple segs) 59 | bm = bm.astype(np.uint8) # convert to np.uint8 60 | return bm 61 | 62 | 63 | def merge_rle(rle_list, is_instance=True, on_image=False): 64 | if is_instance: 65 | cm_list = [] 66 | for rle_idx, rle in enumerate(rle_list): 67 | color = VOC_COLORMAP[rle_idx] 68 | bm = rle2bmask(rle) 69 | cm = cv2.cvtColor(bm, 
cv2.COLOR_GRAY2BGR) 70 | cm_list.append(cm * color) 71 | merge_map = np.sum(cm_list, axis=0, dtype=np.uint8) 72 | else: 73 | bm_list = [rle2bmask(rle) for rle in rle_list] 74 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8) 75 | merge_map[merge_map >= 1] = 1 76 | if not on_image: 77 | color = VOC_COLORMAP[0] 78 | merge_map = cv2.cvtColor(merge_map, cv2.COLOR_GRAY2BGR) 79 | merge_map *= np.array(color, dtype=np.uint8) 80 | 81 | merge_map[merge_map > 255] = 255 82 | 83 | if not on_image: 84 | tmp_sum_map = np.sum(merge_map, axis=-1) 85 | merge_map[tmp_sum_map == 0] = 220 86 | return merge_map 87 | 88 | 89 | def merge2bin(rle_list, img_h, img_w): 90 | if rle_list: 91 | bm_list = [rle2bmask(rle) for rle in rle_list] 92 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8) 93 | merge_map[merge_map >= 1] = 255 94 | merge_map = np.expand_dims(merge_map, axis=-1) 95 | return merge_map 96 | else: 97 | return np.zeros([img_h, img_w, 1], dtype=np.uint8) 98 | 99 | 100 | def paste_text(img, text): 101 | fontFace = cv2.FONT_HERSHEY_COMPLEX_SMALL 102 | overlay = img.copy() 103 | # fontFace = cv2.FONT_HERSHEY_TRIPLEX 104 | fontScale = 1 105 | thickness = 1 106 | backgroud_alpha = 0.8 107 | 108 | retval, baseLine = cv2.getTextSize( 109 | text, fontFace=fontFace, fontScale=fontScale, thickness=thickness 110 | ) 111 | topleft = (0, 0) 112 | # bottomright = (topleft[0] + retval[0], topleft[1] + retval[1]+10) 113 | bottomright = (img.shape[1], topleft[1] + retval[1] + 10) 114 | 115 | cv2.rectangle(overlay, topleft, bottomright, thickness=-1, color=(250, 250, 250)) 116 | img = cv2.addWeighted(overlay, backgroud_alpha, img, 1 - backgroud_alpha, 0) 117 | 118 | cv2.putText( 119 | img, 120 | text, 121 | (0, baseLine + 10), 122 | fontScale=fontScale, 123 | fontFace=fontFace, 124 | thickness=thickness, 125 | color=(10, 10, 10), 126 | ) 127 | return img 128 | 129 | 130 | def load_json(json_path, to_int=False): 131 | clean_res_dic = {} 132 | with open(json_path, "r", encoding="utf-8") as f_in: 133 | res_dic = json.load(f_in) 134 | 135 | for ikey, iv in res_dic.items(): 136 | ikey = int(ikey.strip()) if to_int else ikey.strip() 137 | clean_res_dic[ikey] = iv 138 | 139 | return clean_res_dic 140 | 141 | 142 | def path_map(src_path, obj_path): 143 | def inner_map(full_path): 144 | return full_path.replace(src_path, obj_path) 145 | 146 | 147 | def save_pkl(src, obj_path): 148 | with open(obj_path, "wb") as f_out: 149 | pickle.dump(src, f_out) 150 | 151 | 152 | def load_pkl(src_path): 153 | with open(src_path, "rb") as f_in: 154 | in_pkl = pickle.load(f_in) 155 | return in_pkl 156 | 157 | 158 | def copy_file(src_path, obj_path): 159 | shutil.copy(src_path, obj_path) 160 | 161 | 162 | def sentence_analysis(): 163 | return 0 164 | 165 | 166 | def add_checkerboard_bg(image, mask, save_path=None): 167 | # Create a new image with the same size as the original image 168 | new_image = np.zeros_like(image) 169 | 170 | # Define the size of the checkerboard pattern 171 | checkerboard_size = 24 172 | 173 | # Loop over each pixel in the mask 174 | for x in range(mask.shape[1]): 175 | for y in range(mask.shape[0]): 176 | # If the pixel is transparent, draw a checkerboard pattern 177 | if mask[y, x] == 0: 178 | if (x // checkerboard_size) % 2 == (y // checkerboard_size) % 2: 179 | new_image[y, x] = (255, 255, 255) 180 | else: 181 | new_image[y, x] = (128, 128, 128) 182 | # Otherwise, copy the corresponding pixel from the original image 183 | else: 184 | new_image[y, x] = image[y, x] 185 | 186 | # Save the new image with the 
checkerboard background 187 | if save_path: 188 | cv2.imwrite(save_path, new_image) 189 | return new_image 190 | 191 | 192 | def visualize_mask_on_image( 193 | img, mask, save_path=None, add_edge=False, dark_background=False 194 | ): 195 | # Convert the mask to a binary mask if it's not already 196 | if mask.max() > 1: 197 | mask = mask.astype(np.uint8) // 255 198 | 199 | # Convert the mask to a 3-channel mask if it's not already 200 | if len(mask.shape) == 2: 201 | mask = np.expand_dims(mask, axis=2) 202 | mask = np.tile(mask, (1, 1, 3)) 203 | 204 | # Create a color map for the mask 205 | cmap = np.array([255, 117, 44], dtype=np.uint8) 206 | mask_colors = mask * cmap 207 | 208 | # Add an opaque white edge to the mask if desired 209 | if add_edge: 210 | if len(mask.shape) == 2: 211 | mask = np.expand_dims(mask, axis=2) 212 | mask = np.tile(mask, (1, 1, 3)) 213 | 214 | kernel = np.ones((5, 5), dtype=np.uint8) 215 | mask_edge = cv2.erode(mask, kernel, iterations=1) 216 | mask_edge = mask - mask_edge 217 | 218 | # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3]) 219 | mask_colors[mask_edge > 0] = 255 220 | 221 | # Overlay the mask on the masked image 222 | if dark_background: 223 | masked_img = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0) 224 | else: 225 | masked_img = img.copy() 226 | masked_img[mask > 0] = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0)[mask > 0] 227 | 228 | # Save the result to the specified path if provided 229 | if save_path is not None: 230 | cv2.imwrite(save_path, masked_img) 231 | 232 | return masked_img 233 | 234 | 235 | # def visualize_mask_on_image(img, mask, save_path=None, add_edge=False): 236 | # # Convert the mask to a binary mask if it's not already 237 | # if mask.max() > 1: 238 | # mask = mask.astype(np.uint8) // 255 239 | 240 | # # Convert the mask to a 3-channel mask if it's not already 241 | # if len(mask.shape) == 2: 242 | # mask = np.expand_dims(mask, axis=2) 243 | # mask = np.tile(mask, (1, 1, 3)) 244 | 245 | # # Create a color map for the mask 246 | # cmap = np.array([255, 117, 44], dtype=np.uint8) 247 | # mask_colors = mask * cmap 248 | 249 | # # Add an opaque white edge to the mask if desired 250 | # if add_edge: 251 | # if len(mask.shape) == 2: 252 | # mask = np.expand_dims(mask, axis=2) 253 | # mask = np.tile(mask, (1, 1, 3)) 254 | 255 | # kernel = np.ones((5, 5), dtype=np.uint8) 256 | # mask_edge = cv2.erode(mask, kernel, iterations=1) 257 | # mask_edge = mask - mask_edge 258 | 259 | # # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3]) 260 | # mask_colors[mask_edge > 0] = 255 261 | 262 | # # Overlay the mask on the masked image 263 | # masked_img = cv2.addWeighted(img, 0.5, mask_colors, 0.5, 0) 264 | 265 | # # Save the result to the specified path if provided 266 | # if save_path is not None: 267 | # cv2.imwrite(save_path, masked_img) 268 | 269 | # return masked_img 270 | -------------------------------------------------------------------------------- /d_cube/vis_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | import os 5 | from collections import Counter 6 | 7 | import spacy 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from wordcloud import WordCloud 11 | 12 | # from pycirclize import Circos 13 | # from Bio.Phylo.BaseTree import Tree 14 | # from Bio import Phylo 15 | # from newick import Node 16 | 17 | 18 | def plot_hist(data, bins=10, is_norm=False, 
save_path=None, x=None): 19 | sns.set_theme(style="whitegrid", font_scale=2.0) 20 | ax = sns.histplot(data, bins=bins, common_norm=is_norm, kde=False) 21 | ax.set_xlabel(x) 22 | plt.tight_layout() 23 | plt.savefig(save_path) 24 | plt.close() 25 | 26 | 27 | def plot_bars(names, nums, is_sort, save_path=None): 28 | sns.set(style="whitegrid") 29 | 30 | if is_sort: 31 | zipped = zip(nums, names) 32 | sort_zipped = sorted(zipped, key=lambda x: (x[0], x[1])) 33 | result = zip(*sort_zipped) 34 | nums, names = [list(x) for x in result] 35 | 36 | fontx = {"family": "Times New Roman", "size": 10} 37 | fig, ax = plt.subplots() 38 | fig = plt.figure(figsize=(16, 4)) 39 | # sns.set_palette("PuBuGn_d") 40 | sns.barplot(names, nums, palette=sns.cubehelix_palette(80, start=0.5, rot=-0.75)) 41 | fig.autofmt_xdate(rotation=90) 42 | plt.tick_params(axis="x", labelsize=10) 43 | labels = ax.get_xticklabels() + ax.get_yticklabels() 44 | [label.set_fontname("Times New Roman") for label in labels] 45 | plt.tight_layout() 46 | plt.savefig(save_path) 47 | 48 | 49 | def generate_wordclouds(sentences, save_dir): 50 | """Generates word clouds for different parts of speech in a list of sentences. 51 | 52 | Args: 53 | sentences: A list of sentences. 54 | save_dir: The directory to save the word cloud images. 55 | """ 56 | 57 | os.makedirs(save_dir, exist_ok=True) 58 | # Load the spacy model 59 | nlp = spacy.load("en_core_web_sm") 60 | 61 | # Define the parts of speech to include in the word clouds 62 | pos_to_include = ["NOUN", "VERB", "ADJ", "ADV"] 63 | 64 | # Process each sentence and collect the relevant words for each part of speech 65 | words_by_pos = {pos: [] for pos in pos_to_include} 66 | for sent in sentences: 67 | doc = nlp(sent) 68 | for token in doc: 69 | if token.pos_ in pos_to_include: 70 | words_by_pos[token.pos_].append(token.lemma_.lower()) 71 | 72 | # Generate a word cloud for each part of speech 73 | for pos, words in words_by_pos.items(): 74 | if len(words) == 0: 75 | continue # skip parts of speech with no words 76 | 77 | # Count the frequency of each word 78 | word_counts = Counter(words) 79 | 80 | # Generate the word cloud 81 | wordcloud = WordCloud( 82 | width=800, 83 | height=800, 84 | background_color="white", 85 | max_words=200, 86 | colormap="Set2", 87 | max_font_size=150, 88 | ).generate_from_frequencies(word_counts) 89 | 90 | # Save the word cloud image 91 | filename = f"{pos.lower()}_wordcloud.png" 92 | filepath = os.path.join(save_dir, filename) 93 | wordcloud.to_file(filepath) 94 | 95 | 96 | # def vis_group_tree(data_dict, save_path): 97 | 98 | # # Create 3 randomized trees 99 | # tree_size_list = [60, 40, 50] 100 | # trees = [Tree.randomized(string.ascii_uppercase, branch_stdev=0.5) for size in tree_size_list] 101 | 102 | # # Initialize circos sector with 3 randomized tree size 103 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)} 104 | # circos = Circos(sectors, space=5) 105 | 106 | # colors = ["tomato", "skyblue", "limegreen"] 107 | # cmaps = ["bwr", "viridis", "Spectral"] 108 | # for idx, sector in enumerate(circos.sectors): 109 | # sector.text(sector.name, r=120, size=12) 110 | # # Plot randomized tree 111 | # tree = trees[idx] 112 | # tree_track = sector.add_track((30, 70)) 113 | # tree_track.axis(fc=colors[idx], alpha=0.2) 114 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21) 115 | # # Plot randomized bar 116 | # bar_track = sector.add_track((70, 90)) 117 | # x = np.arange(0, int(sector.size)) + 0.5 118 | # height = np.random.randint(1, 
10, int(sector.size)) 119 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//") 120 | 121 | # circos.savefig(save_path, dpi=600) 122 | 123 | # def clean_newick_key(in_str): 124 | # bad_chars = [':', ';', ',', '(', ')'] 125 | # for bad_char in bad_chars: 126 | # in_str = in_str.replace(bad_char, ' ') 127 | # return in_str 128 | 129 | # def build_tree_from_dict(data): 130 | # root = Node() # create the root node 131 | # for key, value in data.items(): 132 | # node = Node(name=clean_newick_key(key)) # name doesn't need to be cleaned 133 | # if value is not None: 134 | # child_node = build_tree_from_dict(value) 135 | # node.add_descendant(child_node) 136 | # root.add_descendant(node) 137 | 138 | # return root 139 | 140 | 141 | def replace_chars_in_dict_keys(d): 142 | """ 143 | Replaces the characters ':', ';', ',', '(', and ')' in the keys of a nested dictionary with '_'. 144 | """ 145 | new_dict = {} 146 | for k, v in d.items(): 147 | if isinstance(v, dict): 148 | v = replace_chars_in_dict_keys(v) 149 | new_key = k.translate(str.maketrans(":;,()", "_____")) 150 | new_dict[new_key] = v 151 | return new_dict 152 | 153 | 154 | def build_newick_tree(tree_dict): 155 | newick_tree = "" 156 | if isinstance(tree_dict, dict): 157 | for key, value in tree_dict.items(): 158 | if isinstance(value, dict): 159 | subtree = build_newick_tree(value) 160 | if subtree: 161 | newick_tree += "(" + subtree + ")" + key + "," 162 | else: 163 | newick_tree += key + "," 164 | else: 165 | newick_tree += key + ":" + str(value) + "," 166 | newick_tree = newick_tree.rstrip(",") + ")" 167 | return newick_tree 168 | else: 169 | return None 170 | 171 | 172 | # def vis_group_tree(data_dict, save_path): 173 | # data_dic = replace_chars_in_dict_keys(data_dict) 174 | # super_group_names = data_dict.keys() 175 | 176 | # # Create 3 randomized trees 177 | # tree_size_list = [60, 40, 50] 178 | # trees = [Phylo.read(StringIO(build_newick_tree(data_dict[super_group_name])), "newick") for super_group_name in super_group_names] 179 | 180 | # # Initialize circos sector with 3 randomized tree size 181 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)} 182 | # circos = Circos(sectors, space=5) 183 | 184 | # colors = ["tomato", "skyblue", "limegreen"] 185 | # cmaps = ["bwr", "viridis", "Spectral"] 186 | # for idx, sector in enumerate(circos.sectors): 187 | # sector.text(sector.name, r=120, size=12) 188 | # # Plot randomized tree 189 | # tree = trees[idx] 190 | # tree_track = sector.add_track((30, 70)) 191 | # tree_track.axis(fc=colors[idx], alpha=0.2) 192 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21) 193 | # # Plot randomized bar 194 | # bar_track = sector.add_track((70, 90)) 195 | # x = np.arange(0, int(sector.size)) + 0.5 196 | # height = np.random.randint(1, 10, int(sector.size)) 197 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//") 198 | 199 | # circos.savefig(save_path, dpi=600) 200 | -------------------------------------------------------------------------------- /doc.md: -------------------------------------------------------------------------------- 1 | # $D^3$ Toolkit Documentation 2 | 3 | 4 | ## Table of Contents 5 | 6 | - [Inference](#inference-on-d3) 7 | - [Key Concepts](#key-concepts-for-users) 8 | - [Evaluation Settings](#evaluation-settings) 9 | - [Evaluation Code and Examples](#evaluation-code-and-examples) 10 | - [Dataset statistics](#dataset-statistics) 11 | 12 | 13 | 14 | 15 | ## Inference on $D^3$ 16 | 17 | 
```python 18 | # import the dataset class 19 | from d_cube import D3 20 | # init a dataset instance 21 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 22 | all_img_ids = d3.get_img_ids() # get the image ids in the dataset 23 | all_img_info = d3.load_imgs(all_img_ids) # load images by passing a list containing some image ids 24 | img_path = all_img_info[0]["file_name"] # obtain one image path so you can load it and inference 25 | # then you can load the image as input for your model 26 | 27 | group_ids = d3.get_group_ids(img_ids=[img_id]) # get the group ids by passing anno ids, image ids, etc. 28 | sent_ids = d3.get_sent_ids(group_ids=group_ids) # get the sentence ids by passing image ids, group ids, etc. 29 | sent_list = d3.load_sents(sent_ids=sent_ids) 30 | ref_list = [sent['raw_sent'] for sent in sent_list] # list[str] 31 | # use these language references in `ref_list` as the references to your REC/OVD/DOD model 32 | 33 | # save the result to a JSON file 34 | ``` 35 | 36 | Concepts and structures of `anno`, `image`, `sent` and `group` are explained in [this part](#key-concepts-for-users). 37 | 38 | In [this directory](eval_sota/) we provide the inference (and evaluation) script on some existing SOTA OVD/REC methods. 39 | 40 | 41 | 42 | ### Output Format 43 | When the inference is done, you need to save a JSON file in the format below (COCO standard output JSON form): 44 | ```json 45 | [ 46 | { 47 | "category_id": "int, the value of sent_id, range [1, 422]", 48 | "bbox": "list[int], [x1, y1, w, h], predicted by your model, same as COCO result format, absolute value in the range of [w, h, w, h]", 49 | "image_id": "int, img_id, can be 0, 1, 2, ....", 50 | "score": "float, predicted by your model, no restriction on its absolute value range" 51 | } 52 | ] 53 | ``` 54 | This JSON file should contain a list, where each item in the list is a dictionary of one detection result. 55 | 56 | With this JSON saved, you can evaluate the JSON in the next step. See [the evaluation step](#evaluation-code-and-examples). 57 | 58 | 59 | 60 | 61 | 62 | ## Key Concepts for Users 63 | 64 | ### `anno` 65 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 66 | 67 | * `id`: an integer representing the ID of the annotation. 68 | * `sent_id`: a list of integers representing the IDs of sentences associated with this annotation. 69 | * `segmentation`: a Run Length Encoding (RLE) representation of the annotation. 70 | * `area`: an integer representing the area of the annotation. 71 | * `iscrowd`: an integer indicating whether this annotation represents a crowd or not. 72 | * `image_id`: an integer representing the ID of the image associated with this annotation. 73 | * `bbox`: a list of four integers representing the bounding box coordinates of the annotation in the format [x, y, width, height]. 74 | * `group_id`: a value that can be any object and represents the ID of the group associated with this annotation. 75 | 76 | ``` python 77 | { 78 | 1 : { 79 | "id": int, 80 | "sent_id": list, 81 | "segmentation": RLE, 82 | "area": int, 83 | "iscrowd": int, 84 | "image_id": int, 85 | "bbox": list, # [x, y, width, height] 86 | "group_id": int 87 | } 88 | } 89 | ``` 90 | 91 | ### `image` 92 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 93 | 94 | * `id`: an integer representing the ID of the image. 95 | * `file_name`: a string representing the file name of the image. 
96 | * `height`: an integer representing the height of the image. 97 | * `width`: an integer representing the width of the image. 98 | * `flickr_url`: a string representing the Flickr URL of the image. 99 | * `anno_id`: a list of integers representing the IDs of annotations associated with this image. 100 | * `group_id`: an integer representing the ID of the group associated with this image. 101 | * `license`: a string representing the license of the image. 102 | 103 | ``` python 104 | { 105 | int : { 106 | "id": int, 107 | "file_name": str, 108 | "height": int, 109 | "width": int, 110 | "flickr_url": str, 111 | "anno_id": list, 112 | "group_id": int, 113 | "license": str, 114 | } 115 | } 116 | ``` 117 | 118 | ### `sent` 119 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 120 | 121 | * `id`: an integer representing the ID of the sentence. 122 | * `anno_id`: a list of integers representing the IDs of annotations associated with this sentence. 123 | * `group_id`: a list of integers representing the IDs of groups associated with this sentence. 124 | * `is_negative`: a boolean indicating whether this sentence is *absence expression* or not. `True` means *absence expression*. 125 | * `raw_sent`: a string representing the raw text of the sentence in English. 126 | * `raw_sent_zh`: a string representing the raw text of the sentence in Chinese. 127 | 128 | ``` python 129 | { 130 | int : { 131 | "id": int, 132 | "anno_id": list, 133 | "group_id": list, 134 | "is_negative": bool, 135 | "raw_sent": str, 136 | "raw_sent_zh": str 137 | } 138 | } 139 | ``` 140 | 141 | ### `group` 142 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 143 | 144 | * `id`: an integer representing the ID of the group. 145 | * `pos_sent_id`: a list of integers representing the IDs of sentences that has referred obejct in the group. 146 | * `inner_sent_id`: a list of integers representing the IDs of sentences belonging to this group. 147 | * `outer_sent_id`: a list of integers representing the IDs of outer-group sentences that has referred obejct in the group. 148 | * `img_id`: a list of integers representing the IDs of images of this group. 149 | * `scene`: a list of strings representing the scenes of this group. 150 | * `group_name`: a string representing the name of this group in English. 151 | * `group_name_zh`: a string representing the name of this group in Chinese. 152 | 153 | ``` python 154 | { 155 | int : { 156 | "id": int, 157 | "pos_sent_id": list, 158 | "inner_sent_id": list, 159 | "outer_sent_id": list, 160 | "img_id": list, 161 | "scene": list, 162 | "group_name": str, 163 | "group_name_zh": str 164 | } 165 | } 166 | ``` 167 | 168 | 169 | 170 | 171 | 172 | ## Evaluation Settings 173 | 174 | 175 | ### Intra- or Inter-Group Settings 176 | 177 | The default evaluation protocol is the intra-group setting, where only a certain references are evaluated for each image. 178 | 179 | In the $D^3$ dataset, images are collected for different groups (scenarios), and the categories (descriptions) are designed based on the scenarios. For the intra-group setting, each image are only evaluated with the descriptions from the group the image belongs to. We call this **intra-scenario setting**. 180 | 181 | Note that each category is actually annotated on each image (with positive or negative instances). 
182 | So you can also evaluate all categories on all images, just like traditional detection datasets. We call this **inter-scenario setting**. 183 | This is quite challenging for the DOD task, as it produces many false positive instances with current methods. 184 | 185 | For intra-group evaluation, you should use: 186 | ``` 187 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 188 | # only get the refs (sents) for the group the image belongs to, which is usually 4 189 | ``` 190 | 191 | For inter-group evaluation, change the corresponding code to: 192 | 193 | ``` 194 | sent_ids = d3.get_sent_ids() 195 | # get all the refs in the dataset 196 | ``` 197 | 198 | This will use all the sentences in the dataset, rather than only the few sentences in the group that the image belongs to. 199 | 200 | This is the only difference in the implementation and evaluation. No further code changes need to be applied. 201 | 202 | For more information, you can refer to Section 3.4 of the DOD paper. 203 | 204 | 205 | ### FULL, PRES and ABS 206 | 207 | FULL, PRES and ABS refer to the full descriptions (422 categories), presence descriptions (316 categories) and absence descriptions (106 categories). 208 | 209 | Absence descriptions are descriptions involving the absence of some concepts, such as lacking certain relationships, attributes or objects. For example, descriptions like "dog *without* leash", "person *without* helmet" and "a hat that is *not* blue" are absence ones. 210 | Similarly, descriptions involving *only* the presence of some concepts are presence descriptions. 211 | 212 | Most existing REC datasets have presence descriptions but few absence descriptions. 213 | 214 | For more details and the meaning of evaluating absence descriptions, please refer to Section 3.1 of the DOD paper. 215 | 216 | 217 | 218 | 219 | ## Evaluation Code and Examples 220 | 221 | In this part, we introduce how to evaluate the performance and obtain the metric values given a prediction result JSON file. 222 | 223 | ### Write a Snippet in Your Code 224 | 225 | This is based on [cocoapi (pycocotools)](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI), and is quite simple: 226 | 227 | ```python 228 | from pycocotools.coco import COCO 229 | from pycocotools.cocoeval import COCOeval 230 | 231 | # Eval results 232 | coco = COCO(gt_path) # `gt_path` is the ground-truth JSON path (different JSON for FULL, PRES or ABS settings in our paper) 233 | d3_model = coco.loadRes(pred_path) # `pred_path` is the prediction JSON file 234 | cocoEval = COCOeval(coco, d3_model, "bbox") 235 | cocoEval.evaluate() 236 | cocoEval.accumulate() 237 | cocoEval.summarize() 238 | ``` 239 | 240 | ### An Off-the-shelf Script 241 | 242 | We also provide [a script](scripts/eval_and_analysis_json.py) that can produce the evaluation results (and some additional analysis) in our paper, given a prediction JSON.
243 | You can use it by: 244 | ```shell 245 | python eval_and_analysis_json.py YOUR_PREDICTION_JSON_PATH 246 | ``` 247 | 248 | A few options are provided for format conversion or more analysis: 249 | ```shell 250 | python eval_and_analysis_json.py --help 251 | 252 | usage: An example script for $D^3$ evaluation with prediction file (JSON) [-h] [--partition-by-nbox] [--partition-by-lens] [--xyxy2xywh] pred_path 253 | 254 | positional arguments: 255 | pred_path path to prediction json 256 | 257 | optional arguments: 258 | -h, --help show this help message and exit 259 | --partition-by-nbox divide the images by num of boxes for each ref 260 | --partition-by-lens divide the references by their lengths 261 | --xyxy2xywh transform box coords from xyxy to xywh 262 | ``` 263 | 264 | 265 | ### Evaluation Examples on SOTA Methods 266 | 267 | See [this directory](eval_sota/) for details. We include the evaluation scripts of some methods there. 268 | 269 | 270 | 271 | ## Dataset Statistics 272 | 273 | [A python script](scripts/get_d3_stat.py) is provided for calculating the statistics of $D^3$ or visualizing figures like histograms, word clouds, etc. 274 | 275 | The specific statistics of the dataset are available in Section 3.3 of the DOD paper. 276 | -------------------------------------------------------------------------------- /eval_sota/README.md: -------------------------------------------------------------------------------- 1 | # Evaluting SOTA Methods on $D^3$ 2 | 3 | ## Leaderboard 4 | 5 | In this directory, we keep the scripts or github links (official or custom) to evaluate SOTA methods (REC/OVD/DOD/MLLM) on $D^3$: 6 | 7 | | Name | Paper | Original Tasks | Training Data | Evaluation Code | Intra-FULL/PRES/ABS/Inter-FULL/PRES/ABS | Source | Note | 8 | |:-----|:-----:|:----:|:-----:|:-----:|:-----:|:-----:|:-----:| 9 | | OFA-large | [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework (ICML 2022)](https://arxiv.org/abs/2202.03052) | REC | - | - | 4.2/4.1/4.6/0.1/0.1/0.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 10 | | CORA-R50 | [CORA: Adapting CLIP for Open-Vocabulary Detection with Region Prompting and Anchor Pre-Matching (CVPR 2023)](https://openaccess.thecvf.com/content/CVPR2023/papers/Wu_CORA_Adapting_CLIP_for_Open-Vocabulary_Detection_With_Region_Prompting_and_CVPR_2023_paper.pdf) | OVD | - | - | 6.2/6.7/5.0/2.0/2.2/1.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 11 | | OWL-ViT-large | [Simple Open-Vocabulary Object Detection with Vision Transformers (ECCV 2022)](https://www.ecva.net/papers/eccv_2022/papers_ECCV/papers/136700714.pdf) | OVD | - | [DOD official](./owl_vit.py) | 9.6/10.7/6.4/2.5/2.9/2.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper | 12 | | SPHINX-7B | [SPHINX: The Joint Mixing of Weights, Tasks, and Visual Embeddings for Multi-modal Large Language Models (arxiv 2023)](https://arxiv.org/abs/2311.07575) | **MLLM** capable of REC | - | [DOD official](./sphinx.py) | 10.6/11.4/7.9/-/-/- | DOD authors | A lot of contribution from [Jie Li](https://github.com/theFool32) | 13 | | GLIP-T | [Grounded Language-Image Pre-training (CVPR 2022)](https://arxiv.org/abs/2112.03857) | OVD & PG | - | - | 19.1/18.3/21.5/-/-/- | GEN paper | - | 14 | | UNINEXT-huge | [Universal Instance Perception as Object Discovery and Retrieval (CVPR 2023)](https://arxiv.org/abs/2303.06674v2) | OVD & REC | - | [DOD 
official](https://github.com/Charles-Xie/UNINEXT_D3) | 20.0/20.6/18.1/3.3/3.9/1.6 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 15 | | Grounding-DINO-base | [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection (arxiv 2023)](https://arxiv.org/abs/2303.05499) | OVD & REC | - | [DOD official](./groundingdino.py) | 20.7/20.1/22.5/2.7/2.4/3.5 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper | 16 | | OFA-DOD-base | [Described Object Detection: Liberating Object Detection with Flexible Expressions (NeurIPS 2023)](https://arxiv.org/abs/2307.12813) | DOD | - | - | 21.6/23.7/15.4/5.7/6.9/2.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 17 | | FIBER-B | [Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (NeurIPS 2022)](https://arxiv.org/abs/2206.07643) | OVD & REC | - | - | 22.7/21.5/26.0/-/-/- | GEN paper | - | 18 | | MM-Grounding-DINO | [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection (arxiv 2024)](https://arxiv.org/abs/2401.02361) | DOD & OVD & REC | O365, GoldG, GRIT, V3Det | [MM-GDINO official](https://github.com/open-mmlab/mmdetection/tree/main/configs/mm_grounding_dino#zero-shot-description-detection-datasetdod) | 22.9/21.9/26.0/-/-/- | MM-GDINO paper | - | 19 | | GEN (FIBER-B) | [Generating Enhanced Negatives for Training Language-Based Object Detectors (arxiv 2024](https://arxiv.org/abs/2401.00094) | DOD | - | - | 26.0/25.2/28.1/-/-/- | GEN paper | Enhancement based on FIBER-B | 20 | | APE-large (D) | [Aligning and Prompting Everything All at Once for Universal Visual Perception (arxiv 2023)](https://arxiv.org/abs/2312.02153) | DOD & OVD & REC | COCO, LVIS, O365, OpenImages, Visual Genome, RefCOCO/+/g, SA-1B, GQA, PhraseCut, Flickr30k | [APE official](https://github.com/shenyunhang/APE) | 37.5/38.8/33.9/21.0/22.0/17.9 | APE paper | Extra training data helps for this amazing performance | 21 | 22 | 23 | Some extra notes: 24 | - Each method is currently recorded by *the variant with the highest performance* in this table, if there are multiple variants available, so it's only a leaderboard, not meant for fair comparison. 25 | - Methods like GLIP, FIBER, etc. are actually not evaluated on OVD benchmarks. For zero-shot eval on DOD, We currently do not distinguish between methods for OVD benchmarks and methods for ZS-OD, as long as it is verified with open-set detection capability. 26 | 27 | For other variants (e.g. for a fair comparison regarding data, backbone, etc.), please refer to the papers. 
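If you want to benchmark a new method on $D^3$, the scripts in this directory all follow the same skeleton: iterate over images, fetch the intra-group descriptions, run the detector, dump COCO-format predictions to a JSON file, and score it with pycocotools. Below is a minimal sketch of that loop; `detect(...)`, `IMG_ROOT`, `PKL_ANNO_PATH` and `JSON_ANNO_PATH` are placeholders you must supply yourself, while the `d_cube` and `pycocotools` calls are the ones used by the scripts here:

```python
import json
import os

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from d_cube import D3

d3 = D3(IMG_ROOT, PKL_ANNO_PATH)  # image root and pkl annotation dir (placeholders)
pred = []
for img_id in d3.get_img_ids():
    img_info = d3.load_imgs(img_id)[0]
    img_path = os.path.join(IMG_ROOT, img_info["file_name"])
    group_ids = d3.get_group_ids(img_ids=[img_id])
    sent_ids = d3.get_sent_ids(group_ids=group_ids)  # intra-group setting
    texts = [s["raw_sent"] for s in d3.load_sents(sent_ids=sent_ids)]
    # `detect` is a placeholder for your model: boxes in [x, y, w, h], labels index into `texts`
    boxes, scores, labels = detect(img_path, texts)
    for box, score, label in zip(boxes, scores, labels):
        pred.append({"image_id": img_id, "category_id": sent_ids[label],
                     "bbox": box, "score": float(score)})

with open("prediction.json", "w") as f:
    json.dump(pred, f)

coco = COCO(os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json"))  # FULL setting
coco_eval = COCOeval(coco, coco.loadRes("prediction.json"), "bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
```

Swapping `d3.get_sent_ids(group_ids=group_ids)` for `d3.get_sent_ids()` gives the inter-group numbers, and pointing `COCO(...)` at the PRES/ABS ground-truth JSONs gives the other two columns.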
28 | -------------------------------------------------------------------------------- /eval_sota/groundingdino.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie" 3 | __maintainer__ = "Chi Xie" 4 | 5 | # An example for how to run this script: 6 | # CUDA_VISIBLE_DEVICES=0 7 | # python groundingdino.py \ 8 | # -c ./groundingdino/config/GroundingDINO_SwinB.cfg.py \ 9 | # -p ./ckpt/groundingdino_swinb_cogcoor.pth \ 10 | # -o "outputs/gdino_d3" \ 11 | # --box_threshold 0.05 \ 12 | # --text_threshold 0.05 \ 13 | # --img-top1 14 | 15 | import argparse 16 | import json 17 | import os 18 | 19 | import numpy as np 20 | import torch 21 | from PIL import Image, ImageDraw, ImageFont 22 | from pycocotools.coco import COCO 23 | from pycocotools.cocoeval import COCOeval 24 | from tqdm import tqdm 25 | 26 | import groundingdino.datasets.transforms as T 27 | from groundingdino.models import build_model 28 | from groundingdino.util.slconfig import SLConfig 29 | from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap 30 | from d_cube import D3 31 | 32 | 33 | def plot_boxes_to_image(image_pil, tgt): 34 | H, W = tgt["size"] 35 | boxes = tgt["boxes"] 36 | labels = tgt["labels"] 37 | assert len(boxes) == len(labels), "boxes and labels must have same length" 38 | 39 | draw = ImageDraw.Draw(image_pil) 40 | mask = Image.new("L", image_pil.size, 0) 41 | mask_draw = ImageDraw.Draw(mask) 42 | 43 | # draw boxes and masks 44 | for box, label in zip(boxes, labels): 45 | # from 0..1 to 0..W, 0..H 46 | box = box * torch.Tensor([W, H, W, H]) 47 | # from xywh to xyxy 48 | box[:2] -= box[2:] / 2 49 | box[2:] += box[:2] 50 | # random color 51 | color = tuple(np.random.randint(0, 255, size=3).tolist()) 52 | # draw 53 | x0, y0, x1, y1 = box 54 | x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) 55 | 56 | draw.rectangle([x0, y0, x1, y1], outline=color, width=6) 57 | # draw.text((x0, y0), str(label), fill=color) 58 | 59 | font = ImageFont.load_default() 60 | if hasattr(font, "getbbox"): 61 | bbox = draw.textbbox((x0, y0), str(label), font) 62 | else: 63 | w, h = draw.textsize(str(label), font) 64 | bbox = (x0, y0, w + x0, y0 + h) 65 | # bbox = draw.textbbox((x0, y0), str(label)) 66 | draw.rectangle(bbox, fill=color) 67 | draw.text((x0, y0), str(label), fill="white") 68 | 69 | mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6) 70 | return image_pil, mask 71 | 72 | 73 | def load_image(image_path): 74 | # load image 75 | image_pil = Image.open(image_path).convert("RGB") # load image 76 | 77 | transform = T.Compose( 78 | [ 79 | T.RandomResize([800], max_size=1333), 80 | T.ToTensor(), 81 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 82 | ] 83 | ) 84 | image, _ = transform(image_pil, None) # 3, h, w 85 | return image_pil, image 86 | 87 | 88 | def load_model(model_config_path, model_checkpoint_path, cpu_only=False): 89 | args = SLConfig.fromfile(model_config_path) 90 | args.device = "cuda" if not cpu_only else "cpu" 91 | model = build_model(args) 92 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu") 93 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 94 | print(load_res) 95 | _ = model.eval() 96 | return model 97 | 98 | 99 | def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, cpu_only=False): 100 | caption = caption.lower() 101 | caption = caption.strip() 102 | if not caption.endswith("."): 103 | caption = 
caption + "." 104 | device = "cuda" if not cpu_only else "cpu" 105 | model = model.to(device) 106 | image = image.to(device) 107 | with torch.no_grad(): 108 | outputs = model(image[None], captions=[caption]) 109 | logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256) 110 | boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4) 111 | logits.shape[0] 112 | 113 | # filter output 114 | logits_filt = logits.clone() 115 | boxes_filt = boxes.clone() 116 | filt_mask = logits_filt.max(dim=1)[0] > box_threshold 117 | logits_filt = logits_filt[filt_mask] # num_filt, 256 118 | boxes_filt = boxes_filt[filt_mask] # num_filt, 4 119 | logits_filt.shape[0] 120 | 121 | # get phrase 122 | tokenlizer = model.tokenizer 123 | tokenized = tokenlizer(caption) 124 | # build pred 125 | pred_phrases = [] 126 | logits_list = [] 127 | for logit, box in zip(logits_filt, boxes_filt): 128 | pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer) 129 | logits_list.append(logit.max().item()) 130 | if with_logits: 131 | pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") 132 | else: 133 | pred_phrases.append(pred_phrase) 134 | 135 | return boxes_filt, pred_phrases, logits_list 136 | 137 | 138 | def get_dataset_iter(coco): 139 | img_ids = coco.get_img_ids() 140 | for img_id in img_ids: 141 | img_info = coco.load_imgs(img_id)[0] 142 | file_name = img_info["file_name"] 143 | img_path = os.path.join(IMG_ROOT, file_name) 144 | yield img_id, img_path 145 | 146 | 147 | def eval_on_d3(pred_path, mode="pn"): 148 | assert mode in ("pn", "p", "n") 149 | if mode == "pn": 150 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 151 | elif mode == "p": 152 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 153 | else: 154 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 155 | coco = COCO(gt_path) 156 | d3_res = coco.loadRes(pred_path) 157 | cocoEval = COCOeval(coco, d3_res, "bbox") 158 | cocoEval.evaluate() 159 | cocoEval.accumulate() 160 | cocoEval.summarize() 161 | 162 | # comment the following if u only need intra/inter map for full/pres/abs 163 | # ===================== uncomment this if u need detailed analysis ===================== 164 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1] 165 | # category_ids = coco.getCatIds() 166 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 167 | 168 | # aps_lens = defaultdict(list) 169 | # counter_lens = defaultdict(int) 170 | # for i in range(len(category_names)): 171 | # ap = aps[:, :, i] 172 | # ap_value = ap[ap > -1].mean() 173 | # if not np.isnan(ap_value): 174 | # len_ref = len(category_names[i].split(" ")) 175 | # aps_lens[len_ref].append(ap_value) 176 | # counter_lens[len_ref] += 1 177 | 178 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 179 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 180 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 181 | # ap_sum_very_long = sum( 182 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 183 | # ) 184 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 185 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 186 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 187 | # c_sum_very_long = sum( 188 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 189 | # ) 190 | # map_short = ap_sum_short / c_sum_short 191 | # map_mid = ap_sum_mid / c_sum_mid 192 | # map_long = ap_sum_long / c_sum_long 193 | # map_very_long 
= ap_sum_very_long / c_sum_very_long 194 | # print( 195 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 196 | # ) 197 | # ===================== uncomment this if u need detailed analysis ===================== 198 | 199 | 200 | def inference_on_d3(data_iter, model, args, box_threshold, text_threshold): 201 | pred = [] 202 | for idx, (img_id, image_path) in enumerate(tqdm(data_iter)): 203 | # load image 204 | image_pil, image = load_image(image_path) 205 | size = image_pil.size 206 | W, H = size 207 | 208 | group_ids = d3.get_group_ids(img_ids=[img_id]) 209 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 210 | sent_list = d3.load_sents(sent_ids=sent_ids) 211 | text_list = [sent['raw_sent'] for sent in sent_list] 212 | 213 | for sent_id, text_prompt in zip(sent_ids, text_list): 214 | # run model 215 | boxes_filt, pred_phrases, logit_list = get_grounding_output( 216 | model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, with_logits=False, 217 | ) 218 | if args.vis: 219 | pred_dict = { 220 | "boxes": boxes_filt, # [x_center, y_center, w, h] 221 | "size": [size[1], size[0]], 222 | "labels": [f"{phrase}({str(logit)[:4]})" for phrase, logit in zip(pred_phrases, logit_list)], 223 | } 224 | image_with_box = plot_boxes_to_image(image_pil.copy(), pred_dict)[0] 225 | image_with_box.save(os.path.join(output_dir, f"{img_id}_{text_prompt}.jpg")) 226 | if not logit_list: 227 | continue 228 | if args.img_top1: 229 | max_score_idx = logit_list.index(max(logit_list)) 230 | bboxes, phrases, logits = [boxes_filt[max_score_idx]], [pred_phrases[max_score_idx]], [logit_list[max_score_idx]] 231 | else: 232 | bboxes, phrases, logits = boxes_filt, pred_phrases, logit_list 233 | for box, phrase, logit in zip(bboxes, phrases, logits): 234 | if len(phrase) > args.overlap_percent * len(text_prompt) or phrase == text_prompt: 235 | x1, y1, w, h = box.tolist() 236 | x0, y0 = x1 - w / 2, y1 - h / 2 237 | pred_item = { 238 | "image_id": img_id, 239 | "category_id": sent_id, 240 | "bbox": [x0 * W, y0 * H, w * W, h * H], 241 | "score": float(logit), 242 | } 243 | pred.append(pred_item) 244 | 245 | return pred 246 | 247 | 248 | if __name__ == "__main__": 249 | IMG_ROOT = None # set here 250 | JSON_ANNO_PATH = None # set here 251 | PKL_ANNO_PATH = None # set here 252 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 253 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 254 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 255 | 256 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 257 | 258 | parser = argparse.ArgumentParser("Grounding DINO evaluation on D-cube (https://arxiv.org/abs/2307.12813)", add_help=True) 259 | parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file") 260 | parser.add_argument( 261 | "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file" 262 | ) 263 | # parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") 264 | # parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt") 265 | parser.add_argument( 266 | "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory" 267 | ) 268 | parser.add_argument("--vis", action="store_true", help="visualization on D3") 269 | 270 | parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") 271 
| parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") 272 | 273 | parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False") 274 | parser.add_argument("--img-top1", action="store_true", help="select only the box with top max score") 275 | # parser.add_argument("--overlap-percent", type=float, default=1.0, help="overlapping percentage between input prompt and output label") 276 | # this overlapping percentage denotes an additional post-processing technique we designed. if you turn this on, you may get higher performance by tuning this parameter. 277 | args = parser.parse_args() 278 | args.overlap_percent = 1 # by default, we do not use this technique. 279 | print(args) 280 | 281 | # cfg 282 | config_file = args.config_file # change the path of the model config file 283 | checkpoint_path = args.checkpoint_path # change the path of the model 284 | # image_path = args.image_path 285 | # text_prompt = args.text_prompt 286 | output_dir = args.output_dir 287 | box_threshold = args.box_threshold 288 | text_threshold = args.text_threshold 289 | 290 | # make dir 291 | os.makedirs(output_dir, exist_ok=True) 292 | # load model 293 | model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only) 294 | 295 | data_iter = get_dataset_iter(d3) 296 | 297 | pred = inference_on_d3(data_iter, model, args, box_threshold=box_threshold, text_threshold=text_threshold) 298 | 299 | pred_path = os.path.join(output_dir, f"prediction.json") 300 | with open(pred_path, "w") as f_: 301 | json.dump(pred, f_) 302 | eval_on_d3(pred_path, mode='pn') 303 | eval_on_d3(pred_path, mode='p') 304 | eval_on_d3(pred_path, mode='n') 305 | -------------------------------------------------------------------------------- /eval_sota/owl_vit.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | from tqdm import tqdm 6 | from PIL import Image 7 | import numpy as np 8 | from pycocotools.coco import COCO 9 | from pycocotools.cocoeval import COCOeval 10 | import torch 11 | from transformers import OwlViTProcessor, OwlViTForObjectDetection 12 | 13 | from d_cube import D3 14 | 15 | 16 | def write_json(json_path, json_data): 17 | with open(json_path, "w") as f_: 18 | json.dump(json_data, f_) 19 | 20 | 21 | def read_json(json_path): 22 | with open(json_path, "r") as f_: 23 | json_data = json.load(f_) 24 | return json_data 25 | 26 | 27 | def load_image_general(image_path): 28 | image_pil = Image.open(image_path) 29 | return image_pil 30 | 31 | 32 | def get_prediction(model, image, captions, cpu_only=False): 33 | for i in range(len(captions)): 34 | captions[i] = captions[i].lower() 35 | captions[i] = captions[i].strip() 36 | if not captions[i].endswith("."): 37 | captions[i] = captions[i] + "." 
38 | device = "cuda" if not cpu_only else "cpu" 39 | model = model.to(device) 40 | with torch.no_grad(): 41 | inputs = processor(text=[captions], images=image, return_tensors="pt").to( 42 | device 43 | ) 44 | outputs = model(**inputs) 45 | target_size = torch.Tensor([image.size[::-1]]).to(device) 46 | results = processor.post_process_object_detection( 47 | outputs=outputs, target_sizes=target_size, threshold=0.1 48 | # the post precessing threshold will affect the performance obviously 49 | # you may tune it to get better performance, e.g., 0.05 50 | ) 51 | boxes, scores, labels = ( 52 | results[0]["boxes"], 53 | results[0]["scores"], 54 | results[0]["labels"], 55 | ) 56 | return boxes, scores, labels 57 | 58 | 59 | def get_dataset_iter(coco): 60 | img_ids = coco.get_img_ids() 61 | for img_id in img_ids: 62 | img_info = coco.load_imgs(img_id)[0] 63 | file_name = img_info["file_name"] 64 | img_path = os.path.join(IMG_ROOT, file_name) 65 | yield img_id, img_path 66 | 67 | 68 | def eval_on_d3(pred_path, mode="pn"): 69 | assert mode in ("pn", "p", "n") 70 | if mode == "pn": 71 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 72 | elif mode == "p": 73 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 74 | else: 75 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 76 | coco = COCO(gt_path) 77 | d3_res = coco.loadRes(pred_path) 78 | cocoEval = COCOeval(coco, d3_res, "bbox") 79 | cocoEval.evaluate() 80 | cocoEval.accumulate() 81 | cocoEval.summarize() 82 | 83 | # comment the following if u only need intra/inter map for full/pres/abs 84 | # ===================== uncomment this if u need detailed analysis ===================== 85 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1] 86 | # category_ids = coco.getCatIds() 87 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 88 | 89 | # aps_lens = defaultdict(list) 90 | # counter_lens = defaultdict(int) 91 | # for i in range(len(category_names)): 92 | # ap = aps[:, :, i] 93 | # ap_value = ap[ap > -1].mean() 94 | # if not np.isnan(ap_value): 95 | # len_ref = len(category_names[i].split(" ")) 96 | # aps_lens[len_ref].append(ap_value) 97 | # counter_lens[len_ref] += 1 98 | 99 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 100 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 101 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 102 | # ap_sum_very_long = sum( 103 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 104 | # ) 105 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 106 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 107 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 108 | # c_sum_very_long = sum( 109 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 110 | # ) 111 | # map_short = ap_sum_short / c_sum_short 112 | # map_mid = ap_sum_mid / c_sum_mid 113 | # map_long = ap_sum_long / c_sum_long 114 | # map_very_long = ap_sum_very_long / c_sum_very_long 115 | # print( 116 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 117 | # ) 118 | # ===================== uncomment this if u need detailed analysis ===================== 119 | 120 | 121 | def inference_on_d3(data_iter, model): 122 | pred = [] 123 | error = [] 124 | for img_id, image_path in tqdm(data_iter): 125 | image = load_image_general(image_path) 126 | 127 | # ==================================== intra-group 
setting ==================================== 128 | # each image is evaluated with the categories in its group (usually 4) 129 | group_ids = d3.get_group_ids(img_ids=[img_id]) 130 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 131 | # ==================================== intra-group setting ==================================== 132 | # ==================================== inter-group setting ==================================== 133 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset) 134 | # sent_ids = d3.get_sent_ids() 135 | # ==================================== inter-group setting ==================================== 136 | sent_list = d3.load_sents(sent_ids=sent_ids) 137 | text_list = [sent["raw_sent"] for sent in sent_list] 138 | 139 | try: 140 | boxes, scores, labels = get_prediction(model, image, text_list, cpu_only=False) 141 | for box, score, label in zip(boxes, scores, labels): 142 | pred_item = { 143 | "image_id": img_id, 144 | "category_id": sent_ids[label], 145 | "bbox": convert_to_xywh(box.tolist()), # use xywh 146 | "score": float(score), 147 | } 148 | pred.append(pred_item) # the output to be saved to JSON. 149 | except: 150 | print("error!!!") 151 | return pred, error 152 | 153 | 154 | def convert_to_xywh(bbox_xyxy): 155 | """ 156 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format. 157 | """ 158 | x1, y1, x2, y2 = bbox_xyxy 159 | width = x2 - x1 160 | height = y2 - y1 161 | return [x1, y1, width, height] 162 | 163 | 164 | if __name__ == "__main__": 165 | IMG_ROOT = None # set here 166 | JSON_ANNO_PATH = None # set here 167 | PKL_ANNO_PATH = None # set here 168 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 169 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 170 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 171 | 172 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 173 | 174 | output_dir = "ovd/owlvit/" 175 | os.makedirs(output_dir, exist_ok=True) 176 | 177 | # model prediction 178 | processor = OwlViTProcessor.from_pretrained("owl-vit") 179 | model = OwlViTForObjectDetection.from_pretrained("owl-vit") 180 | data_iter = get_dataset_iter(d3) 181 | pred, error = inference_on_d3(data_iter, model) 182 | 183 | pred_path = os.path.join(output_dir, f"prediction.json") 184 | pred_path_error = os.path.join(output_dir, "error.json") 185 | write_json(pred_path, pred) 186 | write_json(pred_path_error, error) 187 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format 188 | # the output format is identical to COCO. 
189 | 190 | eval_on_d3(pred_path, mode="pn") # the FULL setting 191 | eval_on_d3(pred_path, mode="p") # the PRES setting 192 | eval_on_d3(pred_path, mode="n") # the ABS setting 193 | -------------------------------------------------------------------------------- /eval_sota/sphinx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Jie Li" 3 | __maintainer__ = "Chi Xie" 4 | 5 | import json 6 | import os 7 | from collections import defaultdict 8 | import re 9 | 10 | from PIL import Image 11 | from pycocotools.coco import COCO 12 | from pycocotools.cocoeval import COCOeval 13 | 14 | from d_cube import D3 15 | 16 | 17 | def write_json(json_path, json_data): 18 | with open(json_path, "w") as f_: 19 | json.dump(json_data, f_) 20 | 21 | 22 | def read_json(json_path): 23 | with open(json_path, "r") as f_: 24 | json_data = json.load(f_) 25 | return json_data 26 | 27 | 28 | def load_image_general(image_path): 29 | image_pil = Image.open(image_path) 30 | return image_pil 31 | 32 | 33 | def extract_boxes(input_string): 34 | # if input_string.startswith("None"): 35 | # return [] 36 | # Define the pattern using regular expression 37 | pattern = r'\[([\d.,; ]+)\]' 38 | 39 | # Search for the pattern in the input string 40 | match = re.search(pattern, input_string) 41 | 42 | # If a match is found, extract and return the boxes as a list 43 | if match: 44 | boxes_str = match.group(1) 45 | boxes_list = [list(map(float, box.split(','))) for box in boxes_str.split(';')] 46 | return boxes_list 47 | else: 48 | return [] 49 | 50 | 51 | def get_prediction(mllm_res, image, captions, cpu_only=False): 52 | boxes, scores, labels = [], [], [] 53 | width, height = image.size 54 | for idx, res_item in enumerate(mllm_res): 55 | boxes_list = extract_boxes(res_item["answer"]) 56 | for bbox in boxes_list: 57 | bbox_rescaled = get_true_bbox(image.size, bbox) 58 | boxes.append(bbox_rescaled) 59 | scores.append(1.0) 60 | labels.append(idx) 61 | return boxes, scores, labels 62 | 63 | 64 | def get_dataset_iter(coco): 65 | img_ids = coco.get_img_ids() 66 | for img_id in img_ids: 67 | img_info = coco.load_imgs(img_id)[0] 68 | file_name = img_info["file_name"] 69 | img_path = os.path.join(IMG_ROOT, file_name) 70 | yield img_id, file_name, img_path 71 | 72 | 73 | def eval_on_d3(pred_path, mode="pn"): 74 | assert mode in ("pn", "p", "n") 75 | if mode == "pn": 76 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 77 | elif mode == "p": 78 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 79 | else: 80 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 81 | coco = COCO(gt_path) 82 | d3_res = coco.loadRes(pred_path) 83 | cocoEval = COCOeval(coco, d3_res, "bbox") 84 | cocoEval.evaluate() 85 | cocoEval.accumulate() 86 | cocoEval.summarize() 87 | 88 | 89 | def group_sphinx_res_by_img(inference_res): 90 | inference_res_by_img = defaultdict(list) 91 | for res_item in inference_res: 92 | img_path = "/".join(res_item["image_path"].split("/")[-2:]) 93 | inference_res_by_img[img_path].append(res_item) 94 | inference_res_by_img = dict(inference_res_by_img) 95 | return inference_res_by_img 96 | 97 | 98 | def get_true_bbox(img_size, bbox): 99 | width, height = img_size 100 | max_edge = max(height, width) 101 | bbox = [v * max_edge for v in bbox] 102 | diff = abs(width - height) // 2 103 | if height < width: 104 | bbox[1] -= diff 105 | bbox[3] -= diff 106 | else: 107 | bbox[0] -= diff 108 | bbox[2] -= diff 
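# the scaling/offset above assumes the SPHINX boxes are normalized w.r.t. a square canvas of side max(width, height);
# subtracting `diff` shifts the coordinates from that padded frame back to the original image frame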
109 | return bbox 110 | 111 | 112 | def inference_on_d3(data_iter, inference_res): 113 | pred = [] 114 | inf_res_by_img = group_sphinx_res_by_img(inference_res) 115 | for idx, (img_id, img_name, img_path) in enumerate(data_iter): 116 | image = load_image_general(img_path) 117 | 118 | # ==================================== intra-group setting ==================================== 119 | # each image is evaluated with the categories in its group (usually 4) 120 | group_ids = d3.get_group_ids(img_ids=[img_id]) 121 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 122 | # ==================================== intra-group setting ==================================== 123 | # ==================================== inter-group setting ==================================== 124 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset) 125 | # sent_ids = d3.get_sent_ids() 126 | # ==================================== inter-group setting ==================================== 127 | sent_list = d3.load_sents(sent_ids=sent_ids) 128 | text_list = [sent["raw_sent"] for sent in sent_list] 129 | 130 | boxes, scores, labels = get_prediction(inf_res_by_img[img_name], image, text_list, cpu_only=False) 131 | for box, score, label in zip(boxes, scores, labels): 132 | pred_item = { 133 | "image_id": img_id, 134 | "category_id": sent_ids[label], 135 | "bbox": convert_to_xywh(box), # use xywh 136 | "score": float(score), 137 | } 138 | pred.append(pred_item) # the output to be saved to JSON. 139 | return pred 140 | 141 | 142 | def convert_to_xywh(bbox_xyxy): 143 | """ 144 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format. 145 | """ 146 | x1, y1, x2, y2 = bbox_xyxy 147 | width = x2 - x1 148 | height = y2 - y1 149 | return [x1, y1, width, height] 150 | 151 | 152 | if __name__ == "__main__": 153 | IMG_ROOT = None # set here 154 | JSON_ANNO_PATH = None # set here 155 | PKL_ANNO_PATH = None # set here 156 | # ============================== SPHINX inference result file =============== 157 | SPHINX_INFERENCE_RES_PATH = None 158 | # You can download the SPHINX d3 inference result example from: 159 | # https://github.com/shikras/d-cube/files/14276682/sphinx_d3_result.json 160 | # For the inference process, please refer to SPHINX official repo (https://github.com/Alpha-VLLM/LLaMA2-Accessory) 161 | # the prompts we used are available in this JSON file 162 | # Thanks for the contribution from Jie Li (https://github.com/theFool32) 163 | # ============================== SPHINX inference result file =============== 164 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 165 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 166 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 167 | 168 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 169 | 170 | output_dir = "mllm/sphinx/" # or whatever you prefer 171 | inference_res = read_json(SPHINX_INFERENCE_RES_PATH) 172 | 173 | # model prediction 174 | data_iter = get_dataset_iter(d3) 175 | pred = inference_on_d3(data_iter, inference_res) 176 | 177 | pred_path = os.path.join(output_dir, f"prediction.json") 178 | write_json(pred_path, pred) 179 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format 180 | # the output format is identical to COCO. 
181 | 182 | eval_on_d3(pred_path, mode="pn") # the FULL setting 183 | eval_on_d3(pred_path, mode="p") # the PRES setting 184 | eval_on_d3(pred_path, mode="n") # the ABS setting 185 | -------------------------------------------------------------------------------- /qa.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | Q: 4 | What's the difference between the Intra-Group and Inter-Group settings in [the DOD paper](https://arxiv.org/abs/2307.12813), and how do I set them? 5 | 6 | A: 7 | Please see [this explanation in the document](./doc.md#intra--or-inter-group-settings). 8 | 9 | 10 | 11 | Q: 12 | What do FULL, PRES, and ABS mean, and how do they differ? 13 | 14 | A: 15 | Please see [this explanation in the document](./doc.md#full-pres-and-abs). 16 | 17 | 18 | 19 | Q: 20 | How do I visualize the ground truth or predictions on an image? 21 | 22 | A: 23 | You can use the `d3.get_anno_ids` function and pass the `img_id` you choose as a parameter to get the annotation ids for an image. 24 | After this, you can obtain the annotation details (class ids, bboxes) with `d3.load_annos`. 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pycocotools 3 | opencv-python 4 | matplotlib 5 | -------------------------------------------------------------------------------- /scripts/eval_and_analysis_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this script takes a result JSON file as input, and prints evaluation and analysis results on D-cube (FULL/PRES/ABS, etc.)
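# Example invocations (a sketch; the prediction path below is a placeholder):
#   python scripts/eval_and_analysis_json.py /path/to/pred.json                      # FULL/PRES/ABS evaluation
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --xyxy2xywh          # convert boxes from xyxy to xywh first
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --partition-by-nbox  # break down by number of boxes per reference
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --partition-by-lens  # break down by reference length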
5 | import os 6 | import json 7 | import argparse 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | from pycocotools.coco import COCO 12 | from pycocotools.cocoeval import COCOeval 13 | 14 | from d_cube import D3 15 | 16 | def eval_on_d3(pred_path, mode="pn", nbox_partition=None, lref_partition=False): 17 | assert mode in ("pn", "p", "n") 18 | if mode == "pn": 19 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 20 | elif mode == "p": 21 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 22 | else: 23 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 24 | 25 | if nbox_partition: 26 | gt_path, pred_path = nbox_partition_json(gt_path, pred_path, nbox_partition) 27 | 28 | # Eval results 29 | coco = COCO(gt_path) 30 | d3_res = coco.loadRes(pred_path) 31 | cocoEval = COCOeval(coco, d3_res, "bbox") 32 | cocoEval.evaluate() 33 | cocoEval.accumulate() 34 | cocoEval.summarize() 35 | 36 | aps = cocoEval.eval["precision"][:, :, :, 0, -1] 37 | category_ids = coco.getCatIds() 38 | category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 39 | 40 | if lref_partition: 41 | aps_lens = defaultdict(list) 42 | counter_lens = defaultdict(int) 43 | for i in range(len(category_names)): 44 | ap = aps[:, :, i] 45 | ap_value = ap[ap > -1].mean() 46 | if not np.isnan(ap_value): 47 | len_ref = len(category_names[i].split(" ")) 48 | aps_lens[len_ref].append(ap_value) 49 | counter_lens[len_ref] += 1 50 | 51 | ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 52 | ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 53 | ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 54 | ap_sum_very_long = sum( 55 | [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 56 | ) 57 | c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 58 | c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 59 | c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 60 | c_sum_very_long = sum( 61 | [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 62 | ) 63 | map_short = ap_sum_short / c_sum_short 64 | map_mid = ap_sum_mid / c_sum_mid 65 | map_long = ap_sum_long / c_sum_long 66 | map_very_long = ap_sum_very_long / c_sum_very_long 67 | print( 68 | f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 69 | ) 70 | 71 | 72 | def nbox_partition_json(gt_path, pred_path, nbox_partition): 73 | with open(gt_path, "r") as f_gt: 74 | gts = json.load(f_gt) 75 | with open(pred_path, "r") as f_pred: 76 | preds = json.load(f_pred) 77 | 78 | cat_obj_count = d3.bbox_num_analyze() 79 | annos = gts["annotations"] 80 | new_annos = [] 81 | for ann in annos: 82 | img_id = ann["image_id"] 83 | category_id = ann["category_id"] 84 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1: 85 | new_annos.append(ann) 86 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1: 87 | new_annos.append(ann) 88 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2: 89 | new_annos.append(ann) 90 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3: 91 | new_annos.append(ann) 92 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4: 93 | new_annos.append(ann) 94 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4: 95 | new_annos.append(ann) 96 | gts["annotations"] = new_annos 97 | new_gts = gts 98 | new_preds = 
[] 99 | for prd in preds: 100 | img_id = prd["image_id"] 101 | category_id = prd["category_id"] 102 | if nbox_partition == "no" and cat_obj_count[category_id - 1, img_id] == 0: 103 | new_preds.append(prd) 104 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1: 105 | new_preds.append(prd) 106 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1: 107 | new_preds.append(prd) 108 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2: 109 | new_preds.append(prd) 110 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3: 111 | new_preds.append(prd) 112 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4: 113 | new_preds.append(prd) 114 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4: 115 | new_preds.append(prd) 116 | 117 | new_gt_path = gt_path.replace(".json", f".{nbox_partition}-instance.json") 118 | new_pred_path = pred_path.replace(".json", f".{nbox_partition}-instance.json") 119 | with open(new_gt_path, "w") as fo_gt: 120 | json.dump(new_gts, fo_gt) 121 | with open(new_pred_path, "w") as fo_pred: 122 | json.dump(new_preds, fo_pred) 123 | return new_gt_path, new_pred_path 124 | 125 | 126 | def convert_to_xywh(x1, y1, x2, y2): 127 | """ 128 | Convert top-left and bottom-right corner coordinates to [x,y,width,height] format. 129 | """ 130 | width = x2 - x1 131 | height = y2 - y1 132 | return x1, y1, width, height 133 | 134 | 135 | def transform_json_boxes(pred_path): 136 | with open(pred_path, "r") as f_: 137 | res = json.load(f_) 138 | for item in res: 139 | item["bbox"] = convert_to_xywh(*item["bbox"]) 140 | res_path = pred_path.replace(".json", ".xywh.json") 141 | with open(res_path, "w") as f_w: 142 | json.dump(res, f_w) 143 | return res_path 144 | 145 | 146 | if __name__ == "__main__": 147 | IMG_ROOT = None # set here 148 | JSON_ANNO_PATH = None # set here 149 | PKL_ANNO_PATH = None # set here 150 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 151 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 152 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 153 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 154 | 155 | parser = argparse.ArgumentParser( 156 | "An example script for D-cube evaluation with prediction file (JSON)", 157 | add_help=True, 158 | ) 159 | parser.add_argument("pred_path", type=str, help="path to the prediction JSON file") 160 | parser.add_argument( 161 | "--partition-by-nbox", 162 | action="store_true", 163 | help="divide the images by num of boxes for each ref", 164 | ) 165 | parser.add_argument( 166 | "--partition-by-lens", 167 | action="store_true", 168 | help="divide the references by their lengths", 169 | ) 170 | parser.add_argument( 171 | "--xyxy2xywh", 172 | action="store_true", 173 | help="transform box coords from xyxy to xywh", 174 | ) 175 | args = parser.parse_args() 176 | if args.xyxy2xywh: 177 | # convert predicted boxes from xyxy to xywh before evaluation 178 | pred_path = transform_json_boxes(args.pred_path) 179 | else: 180 | pred_path = args.pred_path 181 | if args.partition_by_nbox: 182 | # partition: no-instance, one-instance, multi-instance, etc. 183 | for mode in ("pn", "p", "n"): 184 | # for ptt in ('no', 'one', 'multi'): 185 | for ptt in ("no", "one", "two", "three", "four", "four_more"): 186 | eval_on_d3(pred_path, mode=mode, nbox_partition=ptt) 187 | else: 188 | eval_on_d3(pred_path, mode="pn", lref_partition=args.partition_by_lens) 189 |
eval_on_d3(pred_path, mode="p", lref_partition=args.partition_by_lens) 190 | eval_on_d3(pred_path, mode="n", lref_partition=args.partition_by_lens) 191 | -------------------------------------------------------------------------------- /scripts/eval_json_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this script takes the result json in, and print evaluation and analysis result on D-cube (FULL/PRES/ABS, etc.) 5 | from pycocotools.coco import COCO 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | # Eval results with COCOAPI 9 | gt_path = "./d3_full_annotations.json" # FULL, PRES or ABS 10 | pred_path = None # set your prediction JSON path 11 | coco = COCO(gt_path) 12 | d3_res = coco.loadRes(pred_path) 13 | cocoEval = COCOeval(coco, d3_res, "bbox") 14 | cocoEval.evaluate() 15 | cocoEval.accumulate() 16 | cocoEval.summarize() 17 | -------------------------------------------------------------------------------- /scripts/get_d3_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from d_cube.vis_util import plot_hist 4 | from d_cube import D3 5 | 6 | 7 | def vis_num_instance(cat_obj_count): 8 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img] 9 | 10 | # Calculate the total number of instances in each image 11 | total_instances_per_image = np.sum(cat_obj_count, axis=0) 12 | 13 | # # Plot the histogram 14 | # plt.hist(total_instances_per_image, bins=20) 15 | # plt.xlabel('Number of Instances') 16 | # plt.ylabel('Frequency') 17 | # plt.title('Distribution of Number of Instances on a Image') 18 | 19 | # # Save the figure 20 | # plt.savefig('vis_fig/instance_distribution.png', bbox_inches='tight') 21 | # plt.close() 22 | plot_hist( 23 | total_instances_per_image, 24 | bins=max(total_instances_per_image) - min(total_instances_per_image) + 1, 25 | save_path="vis_fig/instance_dist_hist.pdf", 26 | ) 27 | 28 | 29 | def vis_num_category(cat_obj_count): 30 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img] 31 | 32 | # Calculate the number of categories in each image 33 | num_categories_per_image = np.sum(cat_obj_count > 0, axis=0) 34 | 35 | # # Plot the histogram 36 | # plt.hist(num_categories_per_image, bins=20) 37 | # plt.xlabel('Number of Categories') 38 | # plt.ylabel('Frequency') 39 | # plt.title('Distribution of Number of Categories on a Image') 40 | 41 | # # Save the figure 42 | # plt.savefig('vis_fig/category_distribution.png', bbox_inches='tight') 43 | # plt.close() 44 | plot_hist( 45 | num_categories_per_image, 46 | bins=max(num_categories_per_image) - min(num_categories_per_image) + 1, 47 | save_path="vis_fig/category_dist_hist.pdf", 48 | ) 49 | 50 | 51 | def vis_num_img_per_cat(cat_obj_count): 52 | num_img_per_cat = np.sum(cat_obj_count > 0, axis=1) 53 | plot_hist( 54 | num_img_per_cat, 55 | bins=20, 56 | save_path="vis_fig/nimg_pcat_hist.pdf", 57 | x="Num. of images", 58 | ) 59 | 60 | 61 | def vis_num_box_per_cat(cat_obj_count): 62 | num_box_per_cat = np.sum(cat_obj_count, axis=1) 63 | plot_hist( 64 | num_box_per_cat, 65 | bins=20, 66 | save_path="vis_fig/nbox_pcat_hist.pdf", 67 | x="Num. 
of instances", 68 | ) 69 | 70 | 71 | def vis_num_box_per_cat_per_img(cat_obj_count): 72 | img_obj_count = cat_obj_count.reshape(-1) 73 | plot_hist( 74 | img_obj_count[img_obj_count > 0], 75 | bins=max(img_obj_count) - min(img_obj_count) + 1, 76 | save_path="vis_fig/nbox_pcat_pimg_hist.pdf", 77 | x="Num. of instances on a image", 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | IMG_ROOT = None # set here 83 | PKL_ANNO_PATH = None # set here 84 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 85 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 86 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 87 | 88 | cat_obj_count = d3.bbox_num_analyze() 89 | vis_num_instance(cat_obj_count) 90 | vis_num_category(cat_obj_count) 91 | vis_num_img_per_cat(cat_obj_count) 92 | vis_num_box_per_cat(cat_obj_count) 93 | vis_num_box_per_cat_per_img(cat_obj_count) 94 | 95 | d3.stat_description(with_rev=False) 96 | d3.stat_description(with_rev=True) 97 | d3.stat_description(with_rev=False, inter_group=True) 98 | d3.stat_description(with_rev=True, inter_group=True) 99 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='ddd-dataset', 5 | version='0.1.2', 6 | author='Chi Xie', 7 | author_email='chixie.personal@gmail.com', 8 | description='Toolkit for Description Detection Dataset ($D^3$)', 9 | long_description='Toolkit for Description Detection Dataset ($D^3$): A detection dataset with class names characterized by intricate and flexible expressions, for the Described Object Detection (DOD) task.', 10 | long_description_content_type='text/markdown', 11 | license='CC BY-NC 4.0', 12 | packages=['d_cube'], 13 | package_dir={"d_cube": "d_cube"}, 14 | url='https://github.com/shikras/d-cube', 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/shikras/d-cube/issues", 17 | }, 18 | install_requires=['numpy', 'pycocotools', 'opencv-python', 'matplotlib'], 19 | 20 | classifiers=[ 21 | 'Development Status :: 4 - Beta', 22 | 'Intended Audience :: Science/Research', 23 | 'Intended Audience :: Developers', 24 | 'Intended Audience :: Education', 25 | 'Operating System :: MacOS', 26 | 'Operating System :: Microsoft :: Windows', 27 | 'Operating System :: POSIX :: Linux', 28 | 'Programming Language :: Python :: 3', 29 | ], 30 | ) 31 | --------------------------------------------------------------------------------