├── .assets ├── d-cube_logo.png └── teaser.png ├── .gitignore ├── LICENSE ├── README.md ├── d_cube ├── __init__.py ├── d3.py ├── data_util.py └── vis_util.py ├── doc.md ├── eval_sota ├── README.md ├── groundingdino.py ├── owl_vit.py └── sphinx.py ├── qa.md ├── requirements.txt ├── scripts ├── eval_and_analysis_json.py ├── eval_json_example.py └── get_d3_stat.py └── setup.py /.assets/d-cube_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/d-cube_logo.png -------------------------------------------------------------------------------- /.assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/teaser.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .vscode/* 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | 163 | # mac system 164 | *.DS_Store 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial 4.0 International 2 | 3 | Creative Commons Corporation ("Creative Commons") is not a law firm and 4 | does not provide legal services or legal advice. Distribution of 5 | Creative Commons public licenses does not create a lawyer-client or 6 | other relationship. Creative Commons makes its licenses and related 7 | information available on an "as-is" basis. Creative Commons gives no 8 | warranties regarding its licenses, any material licensed under their 9 | terms and conditions, or any related information. Creative Commons 10 | disclaims all liability for damages resulting from their use to the 11 | fullest extent possible. 12 | 13 | Using Creative Commons Public Licenses 14 | 15 | Creative Commons public licenses provide a standard set of terms and 16 | conditions that creators and other rights holders may use to share 17 | original works of authorship and other material subject to copyright and 18 | certain other rights specified in the public license below. The 19 | following considerations are for informational purposes only, are not 20 | exhaustive, and do not form part of our licenses. 21 | 22 | - Considerations for licensors: Our public licenses are intended for 23 | use by those authorized to give the public permission to use 24 | material in ways otherwise restricted by copyright and certain other 25 | rights. Our licenses are irrevocable. Licensors should read and 26 | understand the terms and conditions of the license they choose 27 | before applying it. Licensors should also secure all rights 28 | necessary before applying our licenses so that the public can reuse 29 | the material as expected. 
Licensors should clearly mark any material 30 | not subject to the license. This includes other CC-licensed 31 | material, or material used under an exception or limitation to 32 | copyright. More considerations for licensors : 33 | wiki.creativecommons.org/Considerations_for_licensors 34 | 35 | - Considerations for the public: By using one of our public licenses, 36 | a licensor grants the public permission to use the licensed material 37 | under specified terms and conditions. If the licensor's permission 38 | is not necessary for any reason–for example, because of any 39 | applicable exception or limitation to copyright–then that use is not 40 | regulated by the license. Our licenses grant only permissions under 41 | copyright and certain other rights that a licensor has authority to 42 | grant. Use of the licensed material may still be restricted for 43 | other reasons, including because others have copyright or other 44 | rights in the material. A licensor may make special requests, such 45 | as asking that all changes be marked or described. Although not 46 | required by our licenses, you are encouraged to respect those 47 | requests where reasonable. More considerations for the public : 48 | wiki.creativecommons.org/Considerations_for_licensees 49 | 50 | Creative Commons Attribution-NonCommercial 4.0 International Public 51 | License 52 | 53 | By exercising the Licensed Rights (defined below), You accept and agree 54 | to be bound by the terms and conditions of this Creative Commons 55 | Attribution-NonCommercial 4.0 International Public License ("Public 56 | License"). To the extent this Public License may be interpreted as a 57 | contract, You are granted the Licensed Rights in consideration of Your 58 | acceptance of these terms and conditions, and the Licensor grants You 59 | such rights in consideration of benefits the Licensor receives from 60 | making the Licensed Material available under these terms and conditions. 61 | 62 | - Section 1 – Definitions. 63 | 64 | - a. Adapted Material means material subject to Copyright and 65 | Similar Rights that is derived from or based upon the Licensed 66 | Material and in which the Licensed Material is translated, 67 | altered, arranged, transformed, or otherwise modified in a 68 | manner requiring permission under the Copyright and Similar 69 | Rights held by the Licensor. For purposes of this Public 70 | License, where the Licensed Material is a musical work, 71 | performance, or sound recording, Adapted Material is always 72 | produced where the Licensed Material is synched in timed 73 | relation with a moving image. 74 | - b. Adapter's License means the license You apply to Your 75 | Copyright and Similar Rights in Your contributions to Adapted 76 | Material in accordance with the terms and conditions of this 77 | Public License. 78 | - c. Copyright and Similar Rights means copyright and/or similar 79 | rights closely related to copyright including, without 80 | limitation, performance, broadcast, sound recording, and Sui 81 | Generis Database Rights, without regard to how the rights are 82 | labeled or categorized. For purposes of this Public License, the 83 | rights specified in Section 2(b)(1)-(2) are not Copyright and 84 | Similar Rights. 85 | - d. 
Effective Technological Measures means those measures that, 86 | in the absence of proper authority, may not be circumvented 87 | under laws fulfilling obligations under Article 11 of the WIPO 88 | Copyright Treaty adopted on December 20, 1996, and/or similar 89 | international agreements. 90 | - e. Exceptions and Limitations means fair use, fair dealing, 91 | and/or any other exception or limitation to Copyright and 92 | Similar Rights that applies to Your use of the Licensed 93 | Material. 94 | - f. Licensed Material means the artistic or literary work, 95 | database, or other material to which the Licensor applied this 96 | Public License. 97 | - g. Licensed Rights means the rights granted to You subject to 98 | the terms and conditions of this Public License, which are 99 | limited to all Copyright and Similar Rights that apply to Your 100 | use of the Licensed Material and that the Licensor has authority 101 | to license. 102 | - h. Licensor means the individual(s) or entity(ies) granting 103 | rights under this Public License. 104 | - i. NonCommercial means not primarily intended for or directed 105 | towards commercial advantage or monetary compensation. For 106 | purposes of this Public License, the exchange of the Licensed 107 | Material for other material subject to Copyright and Similar 108 | Rights by digital file-sharing or similar means is NonCommercial 109 | provided there is no payment of monetary compensation in 110 | connection with the exchange. 111 | - j. Share means to provide material to the public by any means or 112 | process that requires permission under the Licensed Rights, such 113 | as reproduction, public display, public performance, 114 | distribution, dissemination, communication, or importation, and 115 | to make material available to the public including in ways that 116 | members of the public may access the material from a place and 117 | at a time individually chosen by them. 118 | - k. Sui Generis Database Rights means rights other than copyright 119 | resulting from Directive 96/9/EC of the European Parliament and 120 | of the Council of 11 March 1996 on the legal protection of 121 | databases, as amended and/or succeeded, as well as other 122 | essentially equivalent rights anywhere in the world. 123 | - l. You means the individual or entity exercising the Licensed 124 | Rights under this Public License. Your has a corresponding 125 | meaning. 126 | 127 | - Section 2 – Scope. 128 | 129 | - a. License grant. 130 | - 1. Subject to the terms and conditions of this Public 131 | License, the Licensor hereby grants You a worldwide, 132 | royalty-free, non-sublicensable, non-exclusive, irrevocable 133 | license to exercise the Licensed Rights in the Licensed 134 | Material to: 135 | - A. reproduce and Share the Licensed Material, in whole 136 | or in part, for NonCommercial purposes only; and 137 | - B. produce, reproduce, and Share Adapted Material for 138 | NonCommercial purposes only. 139 | - 2. Exceptions and Limitations. For the avoidance of doubt, 140 | where Exceptions and Limitations apply to Your use, this 141 | Public License does not apply, and You do not need to comply 142 | with its terms and conditions. 143 | - 3. Term. The term of this Public License is specified in 144 | Section 6(a). 145 | - 4. Media and formats; technical modifications allowed. 
The 146 | Licensor authorizes You to exercise the Licensed Rights in 147 | all media and formats whether now known or hereafter 148 | created, and to make technical modifications necessary to do 149 | so. The Licensor waives and/or agrees not to assert any 150 | right or authority to forbid You from making technical 151 | modifications necessary to exercise the Licensed Rights, 152 | including technical modifications necessary to circumvent 153 | Effective Technological Measures. For purposes of this 154 | Public License, simply making modifications authorized by 155 | this Section 2(a)(4) never produces Adapted Material. 156 | - 5. Downstream recipients. 157 | - A. Offer from the Licensor – Licensed Material. Every 158 | recipient of the Licensed Material automatically 159 | receives an offer from the Licensor to exercise the 160 | Licensed Rights under the terms and conditions of this 161 | Public License. 162 | - B. No downstream restrictions. You may not offer or 163 | impose any additional or different terms or conditions 164 | on, or apply any Effective Technological Measures to, 165 | the Licensed Material if doing so restricts exercise of 166 | the Licensed Rights by any recipient of the Licensed 167 | Material. 168 | - 6. No endorsement. Nothing in this Public License 169 | constitutes or may be construed as permission to assert or 170 | imply that You are, or that Your use of the Licensed 171 | Material is, connected with, or sponsored, endorsed, or 172 | granted official status by, the Licensor or others 173 | designated to receive attribution as provided in Section 174 | 3(a)(1)(A)(i). 175 | - b. Other rights. 176 | - 1. Moral rights, such as the right of integrity, are not 177 | licensed under this Public License, nor are publicity, 178 | privacy, and/or other similar personality rights; however, 179 | to the extent possible, the Licensor waives and/or agrees 180 | not to assert any such rights held by the Licensor to the 181 | limited extent necessary to allow You to exercise the 182 | Licensed Rights, but not otherwise. 183 | - 2. Patent and trademark rights are not licensed under this 184 | Public License. 185 | - 3. To the extent possible, the Licensor waives any right to 186 | collect royalties from You for the exercise of the Licensed 187 | Rights, whether directly or through a collecting society 188 | under any voluntary or waivable statutory or compulsory 189 | licensing scheme. In all other cases the Licensor expressly 190 | reserves any right to collect such royalties, including when 191 | the Licensed Material is used other than for NonCommercial 192 | purposes. 193 | 194 | - Section 3 – License Conditions. 195 | 196 | Your exercise of the Licensed Rights is expressly made subject to 197 | the following conditions. 198 | 199 | - a. Attribution. 200 | - 1. If You Share the Licensed Material (including in modified 201 | form), You must: 202 | - A. retain the following if it is supplied by the 203 | Licensor with the Licensed Material: 204 | - i. identification of the creator(s) of the Licensed 205 | Material and any others designated to receive 206 | attribution, in any reasonable manner requested by 207 | the Licensor (including by pseudonym if designated); 208 | - ii. a copyright notice; 209 | - iii. a notice that refers to this Public License; 210 | - iv. a notice that refers to the disclaimer of 211 | warranties; 212 | - v. a URI or hyperlink to the Licensed Material to 213 | the extent reasonably practicable; 214 | - B. 
indicate if You modified the Licensed Material and 215 | retain an indication of any previous modifications; and 216 | - C. indicate the Licensed Material is licensed under this 217 | Public License, and include the text of, or the URI or 218 | hyperlink to, this Public License. 219 | - 2. You may satisfy the conditions in Section 3(a)(1) in any 220 | reasonable manner based on the medium, means, and context in 221 | which You Share the Licensed Material. For example, it may 222 | be reasonable to satisfy the conditions by providing a URI 223 | or hyperlink to a resource that includes the required 224 | information. 225 | - 3. If requested by the Licensor, You must remove any of the 226 | information required by Section 3(a)(1)(A) to the extent 227 | reasonably practicable. 228 | - 4. If You Share Adapted Material You produce, the Adapter's 229 | License You apply must not prevent recipients of the Adapted 230 | Material from complying with this Public License. 231 | 232 | - Section 4 – Sui Generis Database Rights. 233 | 234 | Where the Licensed Rights include Sui Generis Database Rights that 235 | apply to Your use of the Licensed Material: 236 | 237 | - a. for the avoidance of doubt, Section 2(a)(1) grants You the 238 | right to extract, reuse, reproduce, and Share all or a 239 | substantial portion of the contents of the database for 240 | NonCommercial purposes only; 241 | - b. if You include all or a substantial portion of the database 242 | contents in a database in which You have Sui Generis Database 243 | Rights, then the database in which You have Sui Generis Database 244 | Rights (but not its individual contents) is Adapted Material; 245 | and 246 | - c. You must comply with the conditions in Section 3(a) if You 247 | Share all or a substantial portion of the contents of the 248 | database. 249 | 250 | For the avoidance of doubt, this Section 4 supplements and does not 251 | replace Your obligations under this Public License where the 252 | Licensed Rights include other Copyright and Similar Rights. 253 | 254 | - Section 5 – Disclaimer of Warranties and Limitation of Liability. 255 | 256 | - a. Unless otherwise separately undertaken by the Licensor, to 257 | the extent possible, the Licensor offers the Licensed Material 258 | as-is and as-available, and makes no representations or 259 | warranties of any kind concerning the Licensed Material, whether 260 | express, implied, statutory, or other. This includes, without 261 | limitation, warranties of title, merchantability, fitness for a 262 | particular purpose, non-infringement, absence of latent or other 263 | defects, accuracy, or the presence or absence of errors, whether 264 | or not known or discoverable. Where disclaimers of warranties 265 | are not allowed in full or in part, this disclaimer may not 266 | apply to You. 267 | - b. To the extent possible, in no event will the Licensor be 268 | liable to You on any legal theory (including, without 269 | limitation, negligence) or otherwise for any direct, special, 270 | indirect, incidental, consequential, punitive, exemplary, or 271 | other losses, costs, expenses, or damages arising out of this 272 | Public License or use of the Licensed Material, even if the 273 | Licensor has been advised of the possibility of such losses, 274 | costs, expenses, or damages. Where a limitation of liability is 275 | not allowed in full or in part, this limitation may not apply to 276 | You. 277 | - c. 
The disclaimer of warranties and limitation of liability 278 | provided above shall be interpreted in a manner that, to the 279 | extent possible, most closely approximates an absolute 280 | disclaimer and waiver of all liability. 281 | 282 | - Section 6 – Term and Termination. 283 | 284 | - a. This Public License applies for the term of the Copyright and 285 | Similar Rights licensed here. However, if You fail to comply 286 | with this Public License, then Your rights under this Public 287 | License terminate automatically. 288 | - b. Where Your right to use the Licensed Material has terminated 289 | under Section 6(a), it reinstates: 290 | 291 | - 1. automatically as of the date the violation is cured, 292 | provided it is cured within 30 days of Your discovery of the 293 | violation; or 294 | - 2. upon express reinstatement by the Licensor. 295 | 296 | For the avoidance of doubt, this Section 6(b) does not affect 297 | any right the Licensor may have to seek remedies for Your 298 | violations of this Public License. 299 | 300 | - c. For the avoidance of doubt, the Licensor may also offer the 301 | Licensed Material under separate terms or conditions or stop 302 | distributing the Licensed Material at any time; however, doing 303 | so will not terminate this Public License. 304 | - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 305 | License. 306 | 307 | - Section 7 – Other Terms and Conditions. 308 | 309 | - a. The Licensor shall not be bound by any additional or 310 | different terms or conditions communicated by You unless 311 | expressly agreed. 312 | - b. Any arrangements, understandings, or agreements regarding the 313 | Licensed Material not stated herein are separate from and 314 | independent of the terms and conditions of this Public License. 315 | 316 | - Section 8 – Interpretation. 317 | 318 | - a. For the avoidance of doubt, this Public License does not, and 319 | shall not be interpreted to, reduce, limit, restrict, or impose 320 | conditions on any use of the Licensed Material that could 321 | lawfully be made without permission under this Public License. 322 | - b. To the extent possible, if any provision of this Public 323 | License is deemed unenforceable, it shall be automatically 324 | reformed to the minimum extent necessary to make it enforceable. 325 | If the provision cannot be reformed, it shall be severed from 326 | this Public License without affecting the enforceability of the 327 | remaining terms and conditions. 328 | - c. No term or condition of this Public License will be waived 329 | and no failure to comply consented to unless expressly agreed to 330 | by the Licensor. 331 | - d. Nothing in this Public License constitutes or may be 332 | interpreted as a limitation upon, or waiver of, any privileges 333 | and immunities that apply to the Licensor or You, including from 334 | the legal processes of any jurisdiction or authority. 335 | 336 | Creative Commons is not a party to its public licenses. Notwithstanding, 337 | Creative Commons may elect to apply one of its public licenses to 338 | material it publishes and in those instances will be considered the 339 | "Licensor." The text of the Creative Commons public licenses is 340 | dedicated to the public domain under the CC0 Public Domain Dedication. 
341 | Except for the limited purpose of indicating that material is shared 342 | under a Creative Commons public license or as otherwise permitted by the 343 | Creative Commons policies published at creativecommons.org/policies, 344 | Creative Commons does not authorize the use of the trademark "Creative 345 | Commons" or any other trademark or logo of Creative Commons without its 346 | prior written consent including, without limitation, in connection with 347 | any unauthorized modifications to any of its public licenses or any 348 | other arrangements, understandings, or agreements concerning use of 349 | licensed material. For the avoidance of doubt, this paragraph does not 350 | form part of the public licenses. 351 | 352 | Creative Commons may be contacted at creativecommons.org. 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
![Logo](.assets/d-cube_logo.png)

A detection/segmentation dataset with class names characterized by intricate and flexible expressions

This repo is the toolbox for $D^3$ (the Description Detection Dataset).

[Doc 📚](doc.md) | [Paper (DOD) 📄](https://arxiv.org/abs/2307.12813) | [Paper (GRES) 📄](https://arxiv.org/abs/2305.12452) | [Awesome-DOD 🕶️](https://github.com/Charles-Xie/awesome-described-object-detection)
***

Description Detection Dataset ($D^3$, /dikju:b/) is an attempt at creating a next-generation object detection dataset.
Unlike traditional detection datasets, the class names of the objects are no longer simple nouns or noun phrases, but complex and descriptive expressions, such as `a dog not being held by a leash`.
For each image in the dataset, every object that matches a description is annotated.
The dataset provides annotations such as bounding boxes and finely crafted instance masks.
We believe it will contribute to the computer vision and vision-language communities.

# News

- [02/14/2024] Evaluation results for several SOTA methods (SPHINX (the first MLLM evaluated!), G-DINO, UNINEXT, etc.) are released, together with a [leaderboard](https://github.com/shikras/d-cube/tree/main/eval_sota) for $D^3$. :fire::fire:

- [10/12/2023] We released an [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection) list to collect and track related works.

- [09/22/2023] Our DOD [paper](https://arxiv.org/abs/2307.12813) was accepted by NeurIPS 2023! :fire:

- [07/25/2023] This toolkit is available on PyPI now. You can install this repo with `pip install ddd-dataset`.

- [07/25/2023] The [paper preprint](https://arxiv.org/abs/2307.12813) introducing the DOD task and the $D^3$ dataset is available on arXiv. Check it out!

- [07/18/2023] We have released our Description Detection Dataset ($D^3$) and the first version of the $D^3$ toolbox. You can download it now for your project.

- [07/14/2023] Our GRES [paper](https://arxiv.org/abs/2305.12452) has been accepted by ICCV 2023.

# Contents

- [Dataset Highlight](#task-and-dataset-highlight)
- [Download](#download)
- [Installation](#installation)
- [Usage](#usage)

# Task and Dataset Highlight

The $D^3$ dataset is designed for the Described Object Detection (DOD) task. The image below shows the difference between Referring Expression Comprehension (REC), Object Detection/Open-Vocabulary Detection (OVD), and Described Object Detection (DOD). OVD detects objects based on a category name, and each category can have zero to multiple instances; REC grounds exactly one region based on a language description, whether the referred object actually exists in the image or not; DOD detects every instance that matches a flexible description, on each image in the dataset. Related works are tracked in the [awesome-DOD](https://github.com/Charles-Xie/awesome-described-object-detection) list.

![Dataset Highlight](.assets/teaser.png "Highlight of the task & dataset")

For more information on the characteristics of this dataset, please refer to our paper.

# Download

Currently we host the $D^3$ dataset on cloud drives. You can download the dataset from [Google Drive](https://drive.google.com/drive/folders/11kfY12NzKPwsliLEcIYki1yUqt7PbMEi?usp=sharing) or [Baidu Pan]().

After downloading `d3_images.zip` (images in the dataset), `d3_pkl.zip` (dataset information for this toolkit) and `d3_json.zip` (annotations for evaluation), please extract these 3 zip files to your custom `IMG_ROOT`, `PKL_PATH` and `JSON_ANNO_PATH` directories. These paths will be used when you perform inference or evaluation on this dataset.
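For orientation, one possible layout after extracting the three archives is sketched below. The directory names are purely illustrative (they are not mandated by the toolkit); any location works as long as the same paths are passed to the toolkit and the evaluation scripts.

```text
D3/                 # any root directory you like
├── d3_images/      # extracted from d3_images.zip -> IMG_ROOT
├── d3_pkl/         # extracted from d3_pkl.zip    -> PKL_PATH
└── d3_json/        # extracted from d3_json.zip   -> JSON_ANNO_PATH
```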
# Installation

## Prerequisites
This toolkit requires a few Python packages like `numpy` and `pycocotools`. Other packages like `matplotlib` and `opencv-python` may also be required if you want to use the visualization scripts.

There are multiple ways to install the $D^3$ toolbox, as listed below:

## Install with pip
```bash
pip install ddd-dataset
```

## Install from source
```bash
git clone https://github.com/shikras/d-cube.git
# option 1: install it as a Python package
cd d-cube
python -m pip install .
# done

# option 2: just put the d-cube/d_cube directory in the root directory of your local repository
```

# Usage
Please refer to the [documentation 📚](doc.md) for more details.
Our toolbox is similar to [cocoapi](https://github.com/cocodataset/cocoapi) in style.

Here is a quick example of how to use $D^3$.
```python
from d_cube import D3
d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
all_img_ids = d3.get_img_ids()  # get the image ids in the dataset
all_img_info = d3.load_imgs(all_img_ids)  # load image info by passing a list of image ids
img_path = all_img_info[0]["file_name"]  # the image file name (relative to IMG_ROOT), so you can load it and run inference
```

Some frequently asked questions are answered in [this Q&A file](./qa.md).

# Citation

If you use our $D^3$ dataset, this toolbox, or otherwise find our work valuable, please cite [our paper](https://arxiv.org/abs/2307.12813):

```bibtex
@inproceedings{xie2023DOD,
  title={Described Object Detection: Liberating Object Detection with Flexible Expressions},
  author={Xie, Chi and Zhang, Zhao and Wu, Yixuan and Zhu, Feng and Zhao, Rui and Liang, Shuang},
  booktitle={Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS)},
  year={2023}
}

@inproceedings{wu2023gres,
  title={Advancing Referring Expression Segmentation Beyond Single Image},
  author={Wu, Yixuan and Zhang, Zhao and Xie, Chi and Zhu, Feng and Zhao, Rui},
  booktitle={International Conference on Computer Vision (ICCV)},
  year={2023}
}
```

More works related to Described Object Detection are tracked in this list: [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection).
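# Extended Usage Example

Beyond the quick example above, a typical DOD workflow iterates over description groups, runs a detector on every image of a group with all of the group's descriptions, and collects per-description predictions for evaluation. The sketch below is a minimal, non-authoritative illustration of such a loop: it only uses accessors and field names that appear in `d_cube/d3.py` (`inner_sent_id`, `raw_sent`, `file_name`, `bbox`, ...), while `run_my_detector` and the path constants are hypothetical placeholders you supply.

```python
import os.path as osp

from d_cube import D3

IMG_ROOT = "path/to/d3_images"    # images extracted from d3_images.zip (placeholder path)
PKL_ANNO_PATH = "path/to/d3_pkl"  # pkl annotations extracted from d3_pkl.zip (placeholder path)

d3 = D3(IMG_ROOT, PKL_ANNO_PATH)

for group in d3.load_groups(d3.get_group_ids()):
    # all descriptions (sentences) belonging to this group
    sent_infos = d3.load_sents(group["inner_sent_id"])
    descriptions = [s["raw_sent"] for s in sent_infos]

    for img_info in d3.load_imgs(group["img_id"]):
        img_path = osp.join(IMG_ROOT, img_info["file_name"])
        # preds = run_my_detector(img_path, descriptions)  # your model: boxes per description

        # ground-truth boxes for the first description on this image (for reference)
        anno_ids = d3.get_anno_ids(img_ids=img_info["id"], sent_ids=sent_infos[0]["id"])
        gt_boxes = [anno["bbox"][0].tolist() for anno in d3.load_annos(anno_ids)]
```

Predictions are typically dumped to a COCO-style JSON and scored with the scripts under `scripts/` (e.g. `eval_and_analysis_json.py`); see the [documentation](doc.md) and `eval_sota/` for concrete examples.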
139 | -------------------------------------------------------------------------------- /d_cube/__init__.py: -------------------------------------------------------------------------------- 1 | from .d3 import D3 2 | -------------------------------------------------------------------------------- /d_cube/d3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this is the core of the d-cube toolkit 5 | import os 6 | import os.path as osp 7 | import json 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | from pycocotools import mask 12 | import cv2 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | from .data_util import * 17 | 18 | 19 | class D3: 20 | def __init__(self, img_root, anno_root): 21 | self.image_dir = img_root 22 | self.anno_dir = anno_root 23 | self.load_data() 24 | 25 | def load_data(self): 26 | file_names = ["sentences.pkl", "annotations.pkl", "images.pkl", "groups.pkl"] 27 | self.data = { 28 | name.split(".")[0]: load_pkl(osp.join(self.anno_dir, name)) 29 | for name in file_names 30 | } 31 | 32 | def get_sent_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 33 | """get sentence ids for D-cube. 34 | 35 | Args: 36 | anno_ids (list, optional): annotation ids to get sentence ids. Defaults to []. 37 | img_ids (list, optional): image ids to get sentence ids. Defaults to []. 38 | group_ids (list, optional): group ids to get sentence ids. Defaults to []. 39 | sent_ids (list, optional): additional sentence ids you want to include. Defaults to []. 40 | 41 | Raises: 42 | Exception: anno_ids, img_ids and group_ids cannot be used together. 43 | 44 | Returns: 45 | list: sentence ids. 46 | """ 47 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 48 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 49 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 50 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 51 | 52 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 53 | return list(self.data["sentences"].keys()) 54 | 55 | if ( 56 | (anno_ids and img_ids) 57 | or (anno_ids and group_ids) 58 | or (img_ids and group_ids) 59 | ): 60 | raise Exception("anno_ids, img_ids, group_ids can only be used alone") 61 | 62 | out_ids_set = set() 63 | if img_ids: 64 | for img_id in img_ids: 65 | imganno_ids = self.data["images"][img_id]["anno_id"] 66 | for ianno_id in imganno_ids: 67 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"]) 68 | 69 | if group_ids: 70 | for group_id in group_ids: 71 | out_ids_set |= set(self.data["groups"][group_id]["inner_sent_id"]) 72 | 73 | if anno_ids: 74 | for ianno_id in anno_ids: 75 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"]) 76 | 77 | if sent_ids: 78 | out_ids_set &= set(sent_ids) 79 | 80 | return list(out_ids_set) 81 | 82 | def get_anno_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 83 | """get annotation ids for D-cube. 84 | 85 | Args: 86 | anno_ids (list, optional): additional annotation ids you want to include. Defaults to []. 87 | img_ids (list, optional): image ids to get annotation ids. Defaults to []. 88 | group_ids (list, optional): group ids to get annotation ids. Defaults to []. 89 | sent_ids (list, optional): sentence ids to get annotation ids. Defaults to []. 90 | 91 | Raises: 92 | Exception: img_ids and group_ids cannot be used together. 
93 | 94 | Returns: 95 | list: annotation ids. 96 | """ 97 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 98 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 99 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 100 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 101 | 102 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 103 | return list(self.data["annotations"].keys()) 104 | 105 | if img_ids and group_ids: 106 | raise Exception("img_ids, group_ids can only be used alone") 107 | 108 | out_ids_set = set() 109 | if img_ids: 110 | for img_id in img_ids: 111 | out_ids_set |= set(self.data["images"][img_id]["anno_id"]) 112 | 113 | if group_ids: 114 | for group_id in group_ids: 115 | for groupimg_id in self.data["groups"][group_id]["img_id"]: 116 | out_ids_set |= set(self.data["images"][groupimg_id]["anno_id"]) 117 | 118 | if sent_ids and img_ids: 119 | for sent_id in sent_ids: 120 | out_ids_set &= set(self.data["sentences"][sent_id]["anno_id"]) 121 | else: 122 | for sent_id in sent_ids: 123 | out_ids_set |= set(self.data["sentences"][sent_id]["anno_id"]) 124 | 125 | if anno_ids: 126 | out_ids_set &= set(anno_ids) 127 | 128 | return list(out_ids_set) 129 | 130 | def get_img_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 131 | """get image ids for D-cube. 132 | 133 | Args: 134 | anno_ids (list, optional): annotation ids to get image ids. Defaults to []. 135 | img_ids (list, optional): additional image ids you want to include. Defaults to []. 136 | group_ids (list, optional): group ids to get image ids. Defaults to []. 137 | sent_ids (list, optional): sentence ids to get image ids. Defaults to []. 138 | 139 | Raises: 140 | Exception: anno_ids and img_ids cannot be used together. 141 | Exception: anno_ids and group_ids cannot be used together. 142 | 143 | Returns: 144 | list: image ids. 145 | """ 146 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 147 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 148 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 149 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 150 | 151 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 152 | return list(self.data["images"].keys()) 153 | 154 | if anno_ids and img_ids: 155 | raise Exception("anno_ids and img_ids can only be used alone") 156 | if anno_ids and group_ids: 157 | raise Exception("anno_ids and group_ids can only be used alone") 158 | 159 | out_ids_set = set() 160 | if anno_ids: 161 | for ianno_id in anno_ids: 162 | out_ids_set.add(self.data["annotations"][ianno_id]["image_id"]) 163 | 164 | if group_ids: 165 | for group_id in group_ids: 166 | out_ids_set |= set(self.data["groups"][group_id]["img_id"]) 167 | 168 | if sent_ids: 169 | for sent_id in sent_ids: 170 | for sentanno_id in self.data["sentences"][sent_id]["anno_id"]: 171 | out_ids_set.add(self.data["annotations"][sentanno_id]["image_id"]) 172 | 173 | if img_ids: 174 | out_ids_set &= set(img_ids) 175 | 176 | return list(out_ids_set) 177 | 178 | def get_group_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]): 179 | """get group ids for D-cube. 180 | 181 | Args: 182 | anno_ids (list, optional): annotation ids to get group ids. Defaults to []. 183 | img_ids (list, optional): image ids to get group ids. Defaults to []. 184 | group_ids (list, optional): additional group_ids you want to include. Defaults to []. 
185 | sent_ids (list, optional): sentence ids to get group ids. Defaults to []. 186 | 187 | Raises: 188 | Exception: anno_ids, img_ids and sent_ids cannot be used together. 189 | 190 | Returns: 191 | list: group ids. 192 | """ 193 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids] 194 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids] 195 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids] 196 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids] 197 | 198 | if not any([img_ids, anno_ids, group_ids, sent_ids]): 199 | return list(self.data["groups"].keys()) 200 | 201 | if anno_ids and img_ids: 202 | raise Exception("anno_ids and img_ids can only be used alone") 203 | if anno_ids and sent_ids: 204 | raise Exception("anno_ids and sent_ids can only be used alone") 205 | if img_ids and sent_ids: 206 | raise Exception("img_ids and sent_ids can only be used alone") 207 | 208 | out_ids_set = set() 209 | if img_ids: 210 | for img_id in img_ids: 211 | out_ids_set.add(self.data["images"][img_id]["group_id"]) 212 | 213 | if anno_ids: 214 | for anno_id in anno_ids: 215 | out_ids_set.add(self.data["annotations"][anno_id]["group_id"]) 216 | 217 | if sent_ids: 218 | for sent_id in sent_ids: 219 | out_ids_set |= set(self.data["sentences"][sent_id]["group_id"]) 220 | 221 | if group_ids: 222 | out_ids_set &= set(group_ids) 223 | 224 | return list(out_ids_set) 225 | 226 | def load_sents(self, sent_ids=None): 227 | """load sentence info. 228 | 229 | Args: 230 | sent_ids (list, int, optional): sentence ids. Defaults to None. 231 | 232 | Returns: 233 | list: a list of sentence info. 234 | """ 235 | if sent_ids is not None and not isinstance(sent_ids, list): 236 | sent_ids = [sent_ids] 237 | if isinstance(sent_ids, list): 238 | return [self.data["sentences"][sent_id] for sent_id in sent_ids] 239 | else: 240 | return list(self.data["sentences"].values()) 241 | 242 | def load_annos(self, anno_ids=None): 243 | """load annotation info. 244 | 245 | Args: 246 | anno_ids (list, int, optional): annotation ids. Defaults to None. 247 | 248 | Returns: 249 | list: a list of annotation info. 250 | """ 251 | if anno_ids is not None and not isinstance(anno_ids, list): 252 | anno_ids = [anno_ids] 253 | if isinstance(anno_ids, list): 254 | return [self.data["annotations"][anno_id] for anno_id in anno_ids] 255 | else: 256 | return list(self.data["annotations"].values()) 257 | 258 | def load_imgs(self, img_ids=None): 259 | """load image info. 260 | 261 | Args: 262 | img_ids (list, int, optional): image ids. Defaults to None. 263 | 264 | Returns: 265 | list: a list of image info. 266 | """ 267 | if img_ids is not None and not isinstance(img_ids, list): 268 | img_ids = [img_ids] 269 | if isinstance(img_ids, list): 270 | return [self.data["images"][img_ids] for img_ids in img_ids] 271 | else: 272 | return list(self.data["images"].values()) 273 | 274 | def load_groups(self, group_ids=None): 275 | """load group info. 276 | 277 | Args: 278 | group_ids (list, int, optional): group ids. Defaults to None. 279 | 280 | Returns: 281 | list: a list of group info. 
282 | """ 283 | if group_ids is not None and not isinstance(group_ids, list): 284 | group_ids = [group_ids] 285 | if isinstance(group_ids, list): 286 | return [self.data["groups"][group_ids] for group_ids in group_ids] 287 | else: 288 | return list(self.data["groups"].values()) 289 | 290 | def get_mask(self, anno): 291 | rle = anno[0]["segmentation"] 292 | m = mask.decode(rle) 293 | m = np.sum( 294 | m, axis=2 295 | ) # sometimes there are multiple binary map (corresponding to multiple segs) 296 | m = m.astype(np.uint8) # convert to np.uint8 297 | # compute area 298 | area = sum(mask.area(rle)) # should be close to ann['area'] 299 | return {"mask": m, "area": area} 300 | 301 | def show_mask(self, anno): 302 | M = self.get_mask(anno) 303 | msk = M["mask"] 304 | ax = plt.gca() 305 | ax.imshow(msk) 306 | 307 | def show_image_seg( 308 | self, 309 | img_ids=[], 310 | save_dir=None, 311 | show_sent=False, 312 | on_image=False, 313 | checkerboard_bg=False, 314 | is_instance=True, 315 | ): 316 | if is_instance and checkerboard_bg: 317 | raise ValueError( 318 | "Cannot apply both is_instance and checkboard_bg at the same time." 319 | ) 320 | img_infos = self.load_imgs(img_ids=img_ids) 321 | for img_idx, img_info in enumerate(img_infos): 322 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 323 | anno_infos = self.load_annos(img_info["anno_id"]) 324 | 325 | bm_canvas = defaultdict(list) 326 | merge_canvas = defaultdict(list) 327 | for anno_info in anno_infos: 328 | for sent_id in anno_info["sent_id"]: 329 | bm_canvas[sent_id].append(anno_info["segmentation"]) 330 | 331 | for sent_id, bm_list in bm_canvas.items(): 332 | merge_canvas[sent_id] = merge_rle( 333 | bm_list, is_instance=is_instance, on_image=on_image 334 | ) 335 | 336 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 337 | for sent_id, merge_mask in merge_canvas.items(): 338 | if checkerboard_bg: 339 | merge_mask = add_checkerboard_bg(img, merge_mask) 340 | elif on_image: 341 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True) 342 | if show_sent: 343 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 344 | merge_mask = paste_text(merge_mask, sent_en) 345 | cv2.imwrite( 346 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask 347 | ) 348 | 349 | return merge_canvas 350 | 351 | def show_group_seg( 352 | self, 353 | group_ids, 354 | save_root, 355 | show_sent=True, 356 | is_instance=True, 357 | on_image=False, 358 | checkerboard_bg=False, 359 | ): 360 | group_infos = self.load_groups(group_ids=group_ids) 361 | for group_info in group_infos: 362 | save_dir = osp.join(save_root, group_info["group_name"]) 363 | os.makedirs(save_dir, exist_ok=True) 364 | self.show_image_seg( 365 | img_ids=group_info["img_id"], 366 | save_dir=save_dir, 367 | show_sent=show_sent, 368 | is_instance=is_instance, 369 | on_image=on_image, 370 | checkerboard_bg=checkerboard_bg, 371 | ) 372 | 373 | def show_image_seg_bbox( 374 | self, 375 | img_ids=[], 376 | save_dir=None, 377 | show_sent=False, 378 | on_image=False, 379 | checkerboard_bg=False, 380 | is_instance=True, 381 | ): 382 | if is_instance and checkerboard_bg: 383 | raise ValueError( 384 | "Cannot apply both is_instance and checkboard_bg at the same time." 
385 | ) 386 | img_infos = self.load_imgs(img_ids=img_ids) 387 | for img_idx, img_info in enumerate(img_infos): 388 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 389 | anno_infos = self.load_annos(img_info["anno_id"]) 390 | 391 | bm_canvas = defaultdict(list) 392 | merge_canvas = defaultdict(list) 393 | sent_boxes = defaultdict(list) 394 | for anno_info in anno_infos: 395 | for sent_id in anno_info["sent_id"]: 396 | bm_canvas[sent_id].append(anno_info["segmentation"]) 397 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist()) 398 | 399 | for sent_id, bm_list in bm_canvas.items(): 400 | merge_canvas[sent_id] = merge_rle( 401 | bm_list, is_instance=is_instance, on_image=on_image 402 | ) 403 | 404 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 405 | for sent_id, merge_mask in merge_canvas.items(): 406 | # vis mask 407 | if checkerboard_bg: 408 | merge_mask = add_checkerboard_bg(img, merge_mask) 409 | elif on_image: 410 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True) 411 | # vis box 412 | bboxes = sent_boxes[sent_id] 413 | merge_mask = visualize_bbox_on_image(merge_mask, bboxes) 414 | # vis sent 415 | if show_sent: 416 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 417 | merge_mask = paste_text(merge_mask, sent_en) 418 | cv2.imwrite( 419 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask 420 | ) 421 | 422 | return merge_canvas 423 | 424 | def show_group_seg_bbox( 425 | self, 426 | group_ids, 427 | save_root, 428 | show_sent=True, 429 | is_instance=True, 430 | on_image=False, 431 | checkerboard_bg=False, 432 | ): 433 | group_infos = self.load_groups(group_ids=group_ids) 434 | for group_info in group_infos: 435 | save_dir = osp.join(save_root, group_info["group_name"]) 436 | os.makedirs(save_dir, exist_ok=True) 437 | self.show_image_seg_bbox( 438 | img_ids=group_info["img_id"], 439 | save_dir=save_dir, 440 | show_sent=show_sent, 441 | is_instance=is_instance, 442 | on_image=on_image, 443 | checkerboard_bg=checkerboard_bg, 444 | ) 445 | 446 | def show_image_bbox(self, img_ids=[], save_dir=None, show_sent=False): 447 | img_infos = self.load_imgs(img_ids=img_ids) 448 | for img_idx, img_info in enumerate(img_infos): 449 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"])) 450 | anno_infos = self.load_annos(img_info["anno_id"]) 451 | 452 | sent_boxes = defaultdict(list) 453 | for anno_info in anno_infos: 454 | for sent_id in anno_info["sent_id"]: 455 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist()) 456 | 457 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img) 458 | for sent_id, bboxes in sent_boxes.items(): 459 | merge_img = visualize_bbox_on_image(img, bboxes) 460 | if show_sent: 461 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"] 462 | merge_img = paste_text(merge_img, sent_en) 463 | cv2.imwrite( 464 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_img 465 | ) 466 | 467 | def show_group_bbox(self, group_ids, save_root, show_sent=True): 468 | group_infos = self.load_groups(group_ids=group_ids) 469 | for group_info in group_infos: 470 | save_dir = osp.join(save_root, group_info["group_name"]) 471 | os.makedirs(save_dir, exist_ok=True) 472 | self.show_image_bbox( 473 | img_ids=group_info["img_id"], save_dir=save_dir, show_sent=show_sent 474 | ) 475 | 476 | def stat_description(self, with_rev=False, inter_group=False): 477 | """calculate and print dataset statistics. 
478 | 479 | Args: 480 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False. 481 | inter_group (bool, optional): calculate under intra- or inter-group settings. Defaults to False. 482 | """ 483 | stat_dict = {} 484 | # Number of sents 485 | sent_ids = list(self.data["sentences"].keys()) 486 | if not with_rev: 487 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 488 | stat_dict["nsent"] = len(sent_ids) 489 | # Number of annos / instance # TODO: rm rev 490 | stat_dict["nanno"] = len(self.data["annotations"].keys()) 491 | # Number of images 492 | stat_dict["nimg"] = len(self.data["images"].keys()) 493 | # Number of groups 494 | stat_dict["ngroup"] = len(self.data["groups"].keys()) 495 | 496 | # Number of img-sent pair 497 | num_img_sent = 0 498 | for img_id in self.data["images"].keys(): 499 | anno_ids = self.get_anno_ids(img_ids=img_id) 500 | anno_infos = self.load_annos(anno_ids=anno_ids) 501 | cur_sent_set = set() 502 | group_sent_ids = set( 503 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"] 504 | ) 505 | for anno_info in anno_infos: 506 | cur_sent_set |= set( 507 | [i for i in anno_info["sent_id"] if i in group_sent_ids] 508 | ) 509 | if not with_rev: 510 | cur_sent_set = [ 511 | sent_id for sent_id in cur_sent_set if not self.is_revsent(sent_id) 512 | ] 513 | num_img_sent += len(cur_sent_set) 514 | stat_dict["num_img_sent"] = num_img_sent 515 | 516 | # Number of absence img-sent pair 517 | num_anti_img_sent = 0 518 | for img_id in self.data["images"].keys(): 519 | anno_ids = self.get_anno_ids(img_ids=img_id) 520 | anno_infos = self.load_annos(anno_ids=anno_ids) 521 | cur_sent_set = set() 522 | group_sent_ids = set( 523 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"] 524 | ) 525 | for anno_info in anno_infos: 526 | cur_sent_set |= set( 527 | [i for i in anno_info["sent_id"] if i in group_sent_ids] 528 | ) 529 | assert group_sent_ids.issuperset( 530 | cur_sent_set 531 | ), f"{group_sent_ids}, {cur_sent_set}" 532 | cur_anti_sent_set = group_sent_ids - cur_sent_set 533 | if not with_rev: 534 | cur_anti_sent_set = [ 535 | sent_id 536 | for sent_id in cur_anti_sent_set 537 | if not self.is_revsent(sent_id) 538 | ] 539 | num_anti_img_sent += len(cur_anti_sent_set) 540 | stat_dict["num_anti_img_sent"] = num_anti_img_sent 541 | 542 | # Number of anno-sent pair 543 | num_anno_sent = 0 544 | anno_infos = self.load_annos() 545 | for anno_info in anno_infos: 546 | if inter_group: 547 | anno_sent_ids = [i for i in anno_info["sent_id"]] 548 | else: 549 | group_sent_ids = set( 550 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"] 551 | ) 552 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in group_sent_ids] 553 | if not with_rev: 554 | anno_sent_ids = [ 555 | sent_id for sent_id in anno_sent_ids if not self.is_revsent(sent_id) 556 | ] 557 | num_anno_sent += len(anno_sent_ids) 558 | 559 | stat_dict["num_anno_sent"] = num_anno_sent 560 | 561 | # Number of anti anno-sent pair 562 | num_anti_anno_sent = 0 563 | anno_infos = self.load_annos() 564 | for anno_info in anno_infos: 565 | if inter_group: 566 | all_sent_ids = set(self.get_sent_ids()) 567 | anno_sent_ids = anno_info["sent_id"] 568 | 569 | anti_sent_ids = [ 570 | sent_id for sent_id in all_sent_ids if sent_id not in anno_sent_ids 571 | ] 572 | else: 573 | group_sent_ids = set( 574 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"] 575 | ) 576 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in 
group_sent_ids] 577 | 578 | anti_sent_ids = [ 579 | sent_id 580 | for sent_id in group_sent_ids 581 | if sent_id not in anno_sent_ids 582 | ] 583 | 584 | if not with_rev: 585 | anti_sent_ids = [ 586 | sent_id for sent_id in anti_sent_ids if not self.is_revsent(sent_id) 587 | ] 588 | num_anti_anno_sent += len(anti_sent_ids) 589 | 590 | stat_dict["num_anti_anno_sent"] = num_anti_anno_sent 591 | 592 | # Len of sentence 593 | totle_len = 0 594 | for sent_info in self.load_sents(sent_ids): 595 | totle_len += len(sent_info["raw_sent"].split()) 596 | 597 | stat_dict["avg_sent_len"] = totle_len / stat_dict["nsent"] 598 | 599 | print(stat_dict) 600 | 601 | def is_revsent(self, sent_id): 602 | sent_info = self.load_sents(sent_ids=sent_id) 603 | return sent_info[0]["is_negative"] 604 | 605 | def data2coca(self, out_root, with_rev=False): 606 | group_infos = self.load_groups() 607 | for group_info in group_infos: 608 | sent_ids = group_info["inner_sent_id"] 609 | if not with_rev: 610 | sent_ids = [ 611 | sent_id for sent_id in sent_ids if not self.is_revsent(sent_id) 612 | ] 613 | sent_infos = self.load_sents(sent_ids) 614 | for sent_info in sent_infos: 615 | sent = sent_info["raw_sent"] 616 | img_infos = self.load_imgs(group_info["img_id"]) 617 | for img_info in img_infos: 618 | src_img_path = osp.join(self.image_dir, img_info["file_name"]) 619 | raw_name = img_info["file_name"].split("/")[-1] 620 | out_img_dir = osp.join(out_root, "images", sent) 621 | os.makedirs(out_img_dir, exist_ok=True) 622 | out_img_path = osp.join(out_img_dir, raw_name) 623 | copy_file(src_img_path, out_img_path) 624 | 625 | out_mask_dir = osp.join(out_root, "masks", sent) 626 | os.makedirs(out_mask_dir, exist_ok=True) 627 | out_mask_path = osp.join( 628 | out_mask_dir, raw_name.replace(".jpg", ".png") 629 | ) 630 | 631 | cur_anno_ids = self.get_anno_ids( 632 | img_ids=img_info["id"], sent_ids=sent_info["id"] 633 | ) 634 | anno_infos = self.load_annos(cur_anno_ids) 635 | rle_list = [anno_info["segmentation"] for anno_info in anno_infos] 636 | bmask = merge2bin(rle_list, img_info["height"], img_info["width"]) 637 | cv2.imwrite(out_mask_path, bmask) 638 | 639 | def convert2coco(self, out_root, anti_mode=False, is_group_separated=True): 640 | """ 641 | Convert the annotation format of D^3 dataset to COCO. 642 | 1. The sent_id can be viewed as category_id in COCO. 643 | 2. If `is_group_separated` is True, `outer_sent_id` does not need to be considered. 644 | 3. if `with_rev` is False, sents that meet `is_revsent` will be ignore. 
645 | """ 646 | os.makedirs(out_root, exist_ok=True) 647 | coco_dict = { 648 | "images": [], 649 | "categories": [], 650 | "annotations": [], 651 | } 652 | 653 | sent_ids = self.get_sent_ids() 654 | if anti_mode == 1: 655 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 656 | elif anti_mode == 2: 657 | sent_ids = [sent_id for sent_id in sent_ids if self.is_revsent(sent_id)] 658 | elif anti_mode == 0: 659 | pass 660 | else: 661 | raise Exception("Unimplemented anti_mode.") 662 | 663 | sent_infos = self.load_sents(sent_ids) 664 | for isent_info in sent_infos: 665 | coco_dict["categories"].append( 666 | { 667 | "id": isent_info["id"], 668 | "name": isent_info["raw_sent"], 669 | } 670 | ) 671 | 672 | item_id = 0 673 | img_infos = self.load_imgs() 674 | for iimg_info in img_infos: 675 | coco_dict["images"].append( 676 | { 677 | "id": iimg_info["id"], 678 | "file_name": iimg_info["file_name"], 679 | "height": iimg_info["height"], 680 | "width": iimg_info["width"], 681 | } 682 | ) 683 | 684 | anno_ids = self.get_anno_ids(img_ids=iimg_info["id"]) 685 | anno_infos = self.load_annos(anno_ids) 686 | 687 | for ianno_info in anno_infos: 688 | if is_group_separated: 689 | inner_group_sent_ids = [ 690 | isent_id 691 | for isent_id in ianno_info["sent_id"] 692 | if isent_id 693 | in self.load_groups(ianno_info["group_id"])[0]["inner_sent_id"] 694 | ] 695 | cur_sent_ids = inner_group_sent_ids 696 | else: 697 | cur_sent_ids = ianno_info["sent_id"] 698 | 699 | for isent_id in cur_sent_ids: 700 | if isent_id not in sent_ids: 701 | continue 702 | 703 | seg = ianno_info["segmentation"][0].copy() 704 | if isinstance(seg, dict): # RLE 705 | counts = seg["counts"] 706 | if not isinstance(counts, str): 707 | # make it json-serializable 708 | seg["counts"] = counts.decode("ascii") 709 | 710 | coco_dict["annotations"].append( 711 | { 712 | "id": item_id, 713 | "image_id": iimg_info["id"], 714 | "category_id": isent_id, 715 | "segmentation": seg, 716 | "area": int(ianno_info["area"][0]), 717 | "bbox": [ 718 | int(cord) for cord in ianno_info["bbox"][0].tolist() 719 | ], 720 | "iscrowd": 0, # TODO: ianno_info["iscrowd"] 721 | } 722 | ) 723 | item_id += 1 724 | 725 | with open(osp.join(out_root, "coco_annotations.json"), "w") as f: 726 | json.dump(coco_dict, f, indent=4) 727 | 728 | def sent_analyse(self, save_dir, with_rev=False): 729 | """analyze word info in D-cube and generate word length histograms, word clouds, etc. 730 | 731 | Args: 732 | save_dir (str): path to save the visualized results. 733 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False. 
734 | """ 735 | sent_ids = self.get_sent_ids() 736 | if not with_rev: 737 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)] 738 | 739 | sent_lens, sent_raws = [], [] 740 | sent_infos = self.load_sents(sent_ids) 741 | for isent_info in sent_infos: 742 | sent_raws.append(isent_info["raw_sent"]) 743 | sent_lens.append(len(isent_info["raw_sent"].split())) 744 | 745 | os.makedirs(save_dir, exist_ok=True) 746 | # plot_hist( 747 | # sent_lens, 748 | # bins=max(sent_lens) - min(sent_lens) + 1, 749 | # save_path=osp.join(save_dir, "words_hist.pdf"), 750 | # x="Lengths of descriptions", 751 | # ) 752 | # generate_wordclouds(sent_raws, osp.join(save_dir, "word_clouds")) 753 | 754 | def group_analysis(self, save_dir, with_rev=False): 755 | group_infos = self.load_groups() 756 | scene_tree = defaultdict(dict) 757 | 758 | for group_info in group_infos: 759 | scene_tree[group_info["scene"]][group_info["group_name"]] = {"nimg": 0.1} 760 | 761 | # vis_group_tree(scene_tree, osp.join(save_dir, 'scene_tree.png')) # the visualized result is ugly 762 | 763 | def bbox_num_analyze(self): 764 | n_cat = len(self.data["sentences"].keys()) 765 | all_img_ids = self.data["images"].keys() 766 | n_img = len(all_img_ids) 767 | cat_obj_count = np.zeros((n_cat, n_img), dtype=int) 768 | for img_id in all_img_ids: 769 | # img_cat_ids = self.get_sent_ids(img_ids=img_id) 770 | anno_ids = self.get_anno_ids(img_ids=img_id) 771 | anno_infos = self.load_annos(anno_ids=anno_ids) 772 | for anno in anno_infos: 773 | for sid in anno["sent_id"]: 774 | cat_obj_count[sid - 1, img_id] += 1 775 | return cat_obj_count 776 | -------------------------------------------------------------------------------- /d_cube/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # data utility functions are defined in the script 5 | import json 6 | import pickle 7 | import shutil 8 | 9 | # from io import StringIO 10 | # import string 11 | 12 | import numpy as np 13 | import cv2 14 | from pycocotools import mask as cocomask 15 | 16 | VOC_COLORMAP = [ 17 | [128, 0, 0], 18 | [0, 128, 0], 19 | [128, 128, 0], 20 | [0, 0, 128], 21 | [128, 0, 128], 22 | [0, 128, 128], 23 | [128, 128, 128], 24 | [64, 0, 0], 25 | [192, 0, 0], 26 | [64, 128, 0], 27 | [192, 128, 0], 28 | [64, 0, 128], 29 | [192, 0, 128], 30 | [64, 128, 128], 31 | [192, 128, 128], 32 | [0, 64, 0], 33 | [128, 64, 0], 34 | [0, 192, 0], 35 | [128, 192, 0], 36 | [0, 64, 128], 37 | ] 38 | 39 | 40 | def visualize_bbox_on_image(img, bbox_list, save_path=None, thickness=3): 41 | img_copy = img.copy() 42 | for i, bbox in enumerate(bbox_list): 43 | color = tuple(VOC_COLORMAP[i % len(VOC_COLORMAP)]) 44 | x, y, w, h = bbox 45 | img_copy = cv2.rectangle( 46 | img_copy, (int(x), int(y)), (int((x + w)), int(y + h)), color, thickness 47 | ) 48 | if save_path: 49 | cv2.imwrite(save_path, img_copy) 50 | return img_copy 51 | 52 | 53 | def rle2bmask(rle): 54 | bm = cocomask.decode(rle) 55 | if len(bm.shape) == 3: 56 | bm = np.sum( 57 | bm, axis=2 58 | ) # sometimes there are multiple binary map (corresponding to multiple segs) 59 | bm = bm.astype(np.uint8) # convert to np.uint8 60 | return bm 61 | 62 | 63 | def merge_rle(rle_list, is_instance=True, on_image=False): 64 | if is_instance: 65 | cm_list = [] 66 | for rle_idx, rle in enumerate(rle_list): 67 | color = VOC_COLORMAP[rle_idx] 68 | bm = rle2bmask(rle) 69 | cm = cv2.cvtColor(bm, 
cv2.COLOR_GRAY2BGR) 70 | cm_list.append(cm * color) 71 | merge_map = np.sum(cm_list, axis=0, dtype=np.uint8) 72 | else: 73 | bm_list = [rle2bmask(rle) for rle in rle_list] 74 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8) 75 | merge_map[merge_map >= 1] = 1 76 | if not on_image: 77 | color = VOC_COLORMAP[0] 78 | merge_map = cv2.cvtColor(merge_map, cv2.COLOR_GRAY2BGR) 79 | merge_map *= np.array(color, dtype=np.uint8) 80 | 81 | merge_map[merge_map > 255] = 255 82 | 83 | if not on_image: 84 | tmp_sum_map = np.sum(merge_map, axis=-1) 85 | merge_map[tmp_sum_map == 0] = 220 86 | return merge_map 87 | 88 | 89 | def merge2bin(rle_list, img_h, img_w): 90 | if rle_list: 91 | bm_list = [rle2bmask(rle) for rle in rle_list] 92 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8) 93 | merge_map[merge_map >= 1] = 255 94 | merge_map = np.expand_dims(merge_map, axis=-1) 95 | return merge_map 96 | else: 97 | return np.zeros([img_h, img_w, 1], dtype=np.uint8) 98 | 99 | 100 | def paste_text(img, text): 101 | fontFace = cv2.FONT_HERSHEY_COMPLEX_SMALL 102 | overlay = img.copy() 103 | # fontFace = cv2.FONT_HERSHEY_TRIPLEX 104 | fontScale = 1 105 | thickness = 1 106 | backgroud_alpha = 0.8 107 | 108 | retval, baseLine = cv2.getTextSize( 109 | text, fontFace=fontFace, fontScale=fontScale, thickness=thickness 110 | ) 111 | topleft = (0, 0) 112 | # bottomright = (topleft[0] + retval[0], topleft[1] + retval[1]+10) 113 | bottomright = (img.shape[1], topleft[1] + retval[1] + 10) 114 | 115 | cv2.rectangle(overlay, topleft, bottomright, thickness=-1, color=(250, 250, 250)) 116 | img = cv2.addWeighted(overlay, backgroud_alpha, img, 1 - backgroud_alpha, 0) 117 | 118 | cv2.putText( 119 | img, 120 | text, 121 | (0, baseLine + 10), 122 | fontScale=fontScale, 123 | fontFace=fontFace, 124 | thickness=thickness, 125 | color=(10, 10, 10), 126 | ) 127 | return img 128 | 129 | 130 | def load_json(json_path, to_int=False): 131 | clean_res_dic = {} 132 | with open(json_path, "r", encoding="utf-8") as f_in: 133 | res_dic = json.load(f_in) 134 | 135 | for ikey, iv in res_dic.items(): 136 | ikey = int(ikey.strip()) if to_int else ikey.strip() 137 | clean_res_dic[ikey] = iv 138 | 139 | return clean_res_dic 140 | 141 | 142 | def path_map(src_path, obj_path): 143 | def inner_map(full_path): 144 | return full_path.replace(src_path, obj_path) 145 | 146 | 147 | def save_pkl(src, obj_path): 148 | with open(obj_path, "wb") as f_out: 149 | pickle.dump(src, f_out) 150 | 151 | 152 | def load_pkl(src_path): 153 | with open(src_path, "rb") as f_in: 154 | in_pkl = pickle.load(f_in) 155 | return in_pkl 156 | 157 | 158 | def copy_file(src_path, obj_path): 159 | shutil.copy(src_path, obj_path) 160 | 161 | 162 | def sentence_analysis(): 163 | return 0 164 | 165 | 166 | def add_checkerboard_bg(image, mask, save_path=None): 167 | # Create a new image with the same size as the original image 168 | new_image = np.zeros_like(image) 169 | 170 | # Define the size of the checkerboard pattern 171 | checkerboard_size = 24 172 | 173 | # Loop over each pixel in the mask 174 | for x in range(mask.shape[1]): 175 | for y in range(mask.shape[0]): 176 | # If the pixel is transparent, draw a checkerboard pattern 177 | if mask[y, x] == 0: 178 | if (x // checkerboard_size) % 2 == (y // checkerboard_size) % 2: 179 | new_image[y, x] = (255, 255, 255) 180 | else: 181 | new_image[y, x] = (128, 128, 128) 182 | # Otherwise, copy the corresponding pixel from the original image 183 | else: 184 | new_image[y, x] = image[y, x] 185 | 186 | # Save the new image with the 
checkerboard background 187 | if save_path: 188 | cv2.imwrite(save_path, new_image) 189 | return new_image 190 | 191 | 192 | def visualize_mask_on_image( 193 | img, mask, save_path=None, add_edge=False, dark_background=False 194 | ): 195 | # Convert the mask to a binary mask if it's not already 196 | if mask.max() > 1: 197 | mask = mask.astype(np.uint8) // 255 198 | 199 | # Convert the mask to a 3-channel mask if it's not already 200 | if len(mask.shape) == 2: 201 | mask = np.expand_dims(mask, axis=2) 202 | mask = np.tile(mask, (1, 1, 3)) 203 | 204 | # Create a color map for the mask 205 | cmap = np.array([255, 117, 44], dtype=np.uint8) 206 | mask_colors = mask * cmap 207 | 208 | # Add an opaque white edge to the mask if desired 209 | if add_edge: 210 | if len(mask.shape) == 2: 211 | mask = np.expand_dims(mask, axis=2) 212 | mask = np.tile(mask, (1, 1, 3)) 213 | 214 | kernel = np.ones((5, 5), dtype=np.uint8) 215 | mask_edge = cv2.erode(mask, kernel, iterations=1) 216 | mask_edge = mask - mask_edge 217 | 218 | # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3]) 219 | mask_colors[mask_edge > 0] = 255 220 | 221 | # Overlay the mask on the masked image 222 | if dark_background: 223 | masked_img = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0) 224 | else: 225 | masked_img = img.copy() 226 | masked_img[mask > 0] = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0)[mask > 0] 227 | 228 | # Save the result to the specified path if provided 229 | if save_path is not None: 230 | cv2.imwrite(save_path, masked_img) 231 | 232 | return masked_img 233 | 234 | 235 | # def visualize_mask_on_image(img, mask, save_path=None, add_edge=False): 236 | # # Convert the mask to a binary mask if it's not already 237 | # if mask.max() > 1: 238 | # mask = mask.astype(np.uint8) // 255 239 | 240 | # # Convert the mask to a 3-channel mask if it's not already 241 | # if len(mask.shape) == 2: 242 | # mask = np.expand_dims(mask, axis=2) 243 | # mask = np.tile(mask, (1, 1, 3)) 244 | 245 | # # Create a color map for the mask 246 | # cmap = np.array([255, 117, 44], dtype=np.uint8) 247 | # mask_colors = mask * cmap 248 | 249 | # # Add an opaque white edge to the mask if desired 250 | # if add_edge: 251 | # if len(mask.shape) == 2: 252 | # mask = np.expand_dims(mask, axis=2) 253 | # mask = np.tile(mask, (1, 1, 3)) 254 | 255 | # kernel = np.ones((5, 5), dtype=np.uint8) 256 | # mask_edge = cv2.erode(mask, kernel, iterations=1) 257 | # mask_edge = mask - mask_edge 258 | 259 | # # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3]) 260 | # mask_colors[mask_edge > 0] = 255 261 | 262 | # # Overlay the mask on the masked image 263 | # masked_img = cv2.addWeighted(img, 0.5, mask_colors, 0.5, 0) 264 | 265 | # # Save the result to the specified path if provided 266 | # if save_path is not None: 267 | # cv2.imwrite(save_path, masked_img) 268 | 269 | # return masked_img 270 | -------------------------------------------------------------------------------- /d_cube/vis_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | import os 5 | from collections import Counter 6 | 7 | import spacy 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from wordcloud import WordCloud 11 | 12 | # from pycirclize import Circos 13 | # from Bio.Phylo.BaseTree import Tree 14 | # from Bio import Phylo 15 | # from newick import Node 16 | 17 | 18 | def plot_hist(data, bins=10, is_norm=False, 
save_path=None, x=None): 19 | sns.set_theme(style="whitegrid", font_scale=2.0) 20 | ax = sns.histplot(data, bins=bins, common_norm=is_norm, kde=False) 21 | ax.set_xlabel(x) 22 | plt.tight_layout() 23 | plt.savefig(save_path) 24 | plt.close() 25 | 26 | 27 | def plot_bars(names, nums, is_sort, save_path=None): 28 | sns.set(style="whitegrid") 29 | 30 | if is_sort: 31 | zipped = zip(nums, names) 32 | sort_zipped = sorted(zipped, key=lambda x: (x[0], x[1])) 33 | result = zip(*sort_zipped) 34 | nums, names = [list(x) for x in result] 35 | 36 | fontx = {"family": "Times New Roman", "size": 10} 37 | fig, ax = plt.subplots() 38 | fig = plt.figure(figsize=(16, 4)) 39 | # sns.set_palette("PuBuGn_d") 40 | sns.barplot(names, nums, palette=sns.cubehelix_palette(80, start=0.5, rot=-0.75)) 41 | fig.autofmt_xdate(rotation=90) 42 | plt.tick_params(axis="x", labelsize=10) 43 | labels = ax.get_xticklabels() + ax.get_yticklabels() 44 | [label.set_fontname("Times New Roman") for label in labels] 45 | plt.tight_layout() 46 | plt.savefig(save_path) 47 | 48 | 49 | def generate_wordclouds(sentences, save_dir): 50 | """Generates word clouds for different parts of speech in a list of sentences. 51 | 52 | Args: 53 | sentences: A list of sentences. 54 | save_dir: The directory to save the word cloud images. 55 | """ 56 | 57 | os.makedirs(save_dir, exist_ok=True) 58 | # Load the spacy model 59 | nlp = spacy.load("en_core_web_sm") 60 | 61 | # Define the parts of speech to include in the word clouds 62 | pos_to_include = ["NOUN", "VERB", "ADJ", "ADV"] 63 | 64 | # Process each sentence and collect the relevant words for each part of speech 65 | words_by_pos = {pos: [] for pos in pos_to_include} 66 | for sent in sentences: 67 | doc = nlp(sent) 68 | for token in doc: 69 | if token.pos_ in pos_to_include: 70 | words_by_pos[token.pos_].append(token.lemma_.lower()) 71 | 72 | # Generate a word cloud for each part of speech 73 | for pos, words in words_by_pos.items(): 74 | if len(words) == 0: 75 | continue # skip parts of speech with no words 76 | 77 | # Count the frequency of each word 78 | word_counts = Counter(words) 79 | 80 | # Generate the word cloud 81 | wordcloud = WordCloud( 82 | width=800, 83 | height=800, 84 | background_color="white", 85 | max_words=200, 86 | colormap="Set2", 87 | max_font_size=150, 88 | ).generate_from_frequencies(word_counts) 89 | 90 | # Save the word cloud image 91 | filename = f"{pos.lower()}_wordcloud.png" 92 | filepath = os.path.join(save_dir, filename) 93 | wordcloud.to_file(filepath) 94 | 95 | 96 | # def vis_group_tree(data_dict, save_path): 97 | 98 | # # Create 3 randomized trees 99 | # tree_size_list = [60, 40, 50] 100 | # trees = [Tree.randomized(string.ascii_uppercase, branch_stdev=0.5) for size in tree_size_list] 101 | 102 | # # Initialize circos sector with 3 randomized tree size 103 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)} 104 | # circos = Circos(sectors, space=5) 105 | 106 | # colors = ["tomato", "skyblue", "limegreen"] 107 | # cmaps = ["bwr", "viridis", "Spectral"] 108 | # for idx, sector in enumerate(circos.sectors): 109 | # sector.text(sector.name, r=120, size=12) 110 | # # Plot randomized tree 111 | # tree = trees[idx] 112 | # tree_track = sector.add_track((30, 70)) 113 | # tree_track.axis(fc=colors[idx], alpha=0.2) 114 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21) 115 | # # Plot randomized bar 116 | # bar_track = sector.add_track((70, 90)) 117 | # x = np.arange(0, int(sector.size)) + 0.5 118 | # height = np.random.randint(1, 
10, int(sector.size)) 119 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//") 120 | 121 | # circos.savefig(save_path, dpi=600) 122 | 123 | # def clean_newick_key(in_str): 124 | # bad_chars = [':', ';', ',', '(', ')'] 125 | # for bad_char in bad_chars: 126 | # in_str = in_str.replace(bad_char, ' ') 127 | # return in_str 128 | 129 | # def build_tree_from_dict(data): 130 | # root = Node() # create the root node 131 | # for key, value in data.items(): 132 | # node = Node(name=clean_newick_key(key)) # name doesn't need to be cleaned 133 | # if value is not None: 134 | # child_node = build_tree_from_dict(value) 135 | # node.add_descendant(child_node) 136 | # root.add_descendant(node) 137 | 138 | # return root 139 | 140 | 141 | def replace_chars_in_dict_keys(d): 142 | """ 143 | Replaces the characters ':', ';', ',', '(', and ')' in the keys of a nested dictionary with '_'. 144 | """ 145 | new_dict = {} 146 | for k, v in d.items(): 147 | if isinstance(v, dict): 148 | v = replace_chars_in_dict_keys(v) 149 | new_key = k.translate(str.maketrans(":;,()", "_____")) 150 | new_dict[new_key] = v 151 | return new_dict 152 | 153 | 154 | def build_newick_tree(tree_dict): 155 | newick_tree = "" 156 | if isinstance(tree_dict, dict): 157 | for key, value in tree_dict.items(): 158 | if isinstance(value, dict): 159 | subtree = build_newick_tree(value) 160 | if subtree: 161 | newick_tree += "(" + subtree + ")" + key + "," 162 | else: 163 | newick_tree += key + "," 164 | else: 165 | newick_tree += key + ":" + str(value) + "," 166 | newick_tree = newick_tree.rstrip(",") + ")" 167 | return newick_tree 168 | else: 169 | return None 170 | 171 | 172 | # def vis_group_tree(data_dict, save_path): 173 | # data_dic = replace_chars_in_dict_keys(data_dict) 174 | # super_group_names = data_dict.keys() 175 | 176 | # # Create 3 randomized trees 177 | # tree_size_list = [60, 40, 50] 178 | # trees = [Phylo.read(StringIO(build_newick_tree(data_dict[super_group_name])), "newick") for super_group_name in super_group_names] 179 | 180 | # # Initialize circos sector with 3 randomized tree size 181 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)} 182 | # circos = Circos(sectors, space=5) 183 | 184 | # colors = ["tomato", "skyblue", "limegreen"] 185 | # cmaps = ["bwr", "viridis", "Spectral"] 186 | # for idx, sector in enumerate(circos.sectors): 187 | # sector.text(sector.name, r=120, size=12) 188 | # # Plot randomized tree 189 | # tree = trees[idx] 190 | # tree_track = sector.add_track((30, 70)) 191 | # tree_track.axis(fc=colors[idx], alpha=0.2) 192 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21) 193 | # # Plot randomized bar 194 | # bar_track = sector.add_track((70, 90)) 195 | # x = np.arange(0, int(sector.size)) + 0.5 196 | # height = np.random.randint(1, 10, int(sector.size)) 197 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//") 198 | 199 | # circos.savefig(save_path, dpi=600) 200 | -------------------------------------------------------------------------------- /doc.md: -------------------------------------------------------------------------------- 1 | # $D^3$ Toolkit Documentation 2 | 3 | 4 | ## Table of Contents 5 | 6 | - [Inference](#inference-on-d3) 7 | - [Key Concepts](#key-concepts-for-users) 8 | - [Evaluation Settings](#evaluation-settings) 9 | - [Evaluation Code and Examples](#evaluation-code-and-examples) 10 | - [Dataset statistics](#dataset-statistics) 11 | 12 | 13 | 14 | 15 | ## Inference on $D^3$ 16 | 17 | 
```python 18 | # import the dataset class 19 | from d_cube import D3 20 | # init a dataset instance 21 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 22 | all_img_ids = d3.get_img_ids() # get the image ids in the dataset 23 | all_img_info = d3.load_imgs(all_img_ids) # load images by passing a list containing some image ids 24 | img_path = all_img_info[0]["file_name"] # obtain one image path so you can load it and inference 25 | # then you can load the image as input for your model 26 | 27 | group_ids = d3.get_group_ids(img_ids=[img_id]) # get the group ids by passing anno ids, image ids, etc. 28 | sent_ids = d3.get_sent_ids(group_ids=group_ids) # get the sentence ids by passing image ids, group ids, etc. 29 | sent_list = d3.load_sents(sent_ids=sent_ids) 30 | ref_list = [sent['raw_sent'] for sent in sent_list] # list[str] 31 | # use these language references in `ref_list` as the references to your REC/OVD/DOD model 32 | 33 | # save the result to a JSON file 34 | ``` 35 | 36 | Concepts and structures of `anno`, `image`, `sent` and `group` are explained in [this part](#key-concepts-for-users). 37 | 38 | In [this directory](eval_sota/) we provide the inference (and evaluation) script on some existing SOTA OVD/REC methods. 39 | 40 | 41 | 42 | ### Output Format 43 | When the inference is done, you need to save a JSON file in the format below (COCO standard output JSON form): 44 | ```json 45 | [ 46 | { 47 | "category_id": "int, the value of sent_id, range [1, 422]", 48 | "bbox": "list[int], [x1, y1, w, h], predicted by your model, same as COCO result format, absolute value in the range of [w, h, w, h]", 49 | "image_id": "int, img_id, can be 0, 1, 2, ....", 50 | "score": "float, predicted by your model, no restriction on its absolute value range" 51 | } 52 | ] 53 | ``` 54 | This JSON file should contain a list, where each item in the list is a dictionary of one detection result. 55 | 56 | With this JSON saved, you can evaluate the JSON in the next step. See [the evaluation step](#evaluation-code-and-examples). 57 | 58 | 59 | 60 | 61 | 62 | ## Key Concepts for Users 63 | 64 | ### `anno` 65 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 66 | 67 | * `id`: an integer representing the ID of the annotation. 68 | * `sent_id`: a list of integers representing the IDs of sentences associated with this annotation. 69 | * `segmentation`: a Run Length Encoding (RLE) representation of the annotation. 70 | * `area`: an integer representing the area of the annotation. 71 | * `iscrowd`: an integer indicating whether this annotation represents a crowd or not. 72 | * `image_id`: an integer representing the ID of the image associated with this annotation. 73 | * `bbox`: a list of four integers representing the bounding box coordinates of the annotation in the format [x, y, width, height]. 74 | * `group_id`: a value that can be any object and represents the ID of the group associated with this annotation. 75 | 76 | ``` python 77 | { 78 | 1 : { 79 | "id": int, 80 | "sent_id": list, 81 | "segmentation": RLE, 82 | "area": int, 83 | "iscrowd": int, 84 | "image_id": int, 85 | "bbox": list, # [x, y, width, height] 86 | "group_id": int 87 | } 88 | } 89 | ``` 90 | 91 | ### `image` 92 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 93 | 94 | * `id`: an integer representing the ID of the image. 95 | * `file_name`: a string representing the file name of the image. 
96 | * `height`: an integer representing the height of the image. 97 | * `width`: an integer representing the width of the image. 98 | * `flickr_url`: a string representing the Flickr URL of the image. 99 | * `anno_id`: a list of integers representing the IDs of annotations associated with this image. 100 | * `group_id`: an integer representing the ID of the group associated with this image. 101 | * `license`: a string representing the license of the image. 102 | 103 | ``` python 104 | { 105 | int : { 106 | "id": int, 107 | "file_name": str, 108 | "height": int, 109 | "width": int, 110 | "flickr_url": str, 111 | "anno_id": list, 112 | "group_id": int, 113 | "license": str, 114 | } 115 | } 116 | ``` 117 | 118 | ### `sent` 119 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 120 | 121 | * `id`: an integer representing the ID of the sentence. 122 | * `anno_id`: a list of integers representing the IDs of annotations associated with this sentence. 123 | * `group_id`: a list of integers representing the IDs of groups associated with this sentence. 124 | * `is_negative`: a boolean indicating whether this sentence is *absence expression* or not. `True` means *absence expression*. 125 | * `raw_sent`: a string representing the raw text of the sentence in English. 126 | * `raw_sent_zh`: a string representing the raw text of the sentence in Chinese. 127 | 128 | ``` python 129 | { 130 | int : { 131 | "id": int, 132 | "anno_id": list, 133 | "group_id": list, 134 | "is_negative": bool, 135 | "raw_sent": str, 136 | "raw_sent_zh": str 137 | } 138 | } 139 | ``` 140 | 141 | ### `group` 142 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs: 143 | 144 | * `id`: an integer representing the ID of the group. 145 | * `pos_sent_id`: a list of integers representing the IDs of sentences that has referred obejct in the group. 146 | * `inner_sent_id`: a list of integers representing the IDs of sentences belonging to this group. 147 | * `outer_sent_id`: a list of integers representing the IDs of outer-group sentences that has referred obejct in the group. 148 | * `img_id`: a list of integers representing the IDs of images of this group. 149 | * `scene`: a list of strings representing the scenes of this group. 150 | * `group_name`: a string representing the name of this group in English. 151 | * `group_name_zh`: a string representing the name of this group in Chinese. 152 | 153 | ``` python 154 | { 155 | int : { 156 | "id": int, 157 | "pos_sent_id": list, 158 | "inner_sent_id": list, 159 | "outer_sent_id": list, 160 | "img_id": list, 161 | "scene": list, 162 | "group_name": str, 163 | "group_name_zh": str 164 | } 165 | } 166 | ``` 167 | 168 | 169 | 170 | 171 | 172 | ## Evaluation Settings 173 | 174 | 175 | ### Intra- or Inter-Group Settings 176 | 177 | The default evaluation protocol is the intra-group setting, where only a certain references are evaluated for each image. 178 | 179 | In the $D^3$ dataset, images are collected for different groups (scenarios), and the categories (descriptions) are designed based on the scenarios. For the intra-group setting, each image are only evaluated with the descriptions from the group the image belongs to. We call this **intra-scenario setting**. 180 | 181 | Note that each category is actually annotated on each image (with positive or negative instances). 
182 | So you can also evaluate all categories on all images, just like traditional detection datasets. We call this **inter-scenario setting**. 183 | This is quite challenging for the DOD task, as it produces many false positive instances with current methods. 184 | 185 | For intra-group evaluation, you should use: 186 | ``` 187 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 188 | # only get the refs (sents) for the group the image belongs to, which is usually 4 189 | ``` 190 | 191 | For inter-group evaluation, change the corresponding code to: 192 | 193 | ``` 194 | sent_ids = d3.get_sent_ids() 195 | # get all the refs in the dataset 196 | ``` 197 | 198 | This will use all the sentences in the dataset, rather than only the few sentences in the group that the image belongs to. 199 | 200 | This is the only difference in the implementation and evaluation. No further code changes need to be applied. 201 | 202 | For more information, you can refer to Section 3.4 of the DOD paper. 203 | 204 | 205 | ### FULL, PRES and ABS 206 | 207 | FULL, PRES and ABS refer to the full descriptions (422 categories), presence descriptions (316 categories) and absence descriptions (106 categories). 208 | 209 | Absence descriptions are descriptions involving the absence of some concepts, such as lacking certain relationships, attributes or objects. For example, descriptions like "dog *without* leash", "person *without* helmet" and "a hat that is *not* blue" are absence ones. 210 | Similarly, descriptions involving *only* the presence of some concepts are presence descriptions. 211 | 212 | Most existing REC datasets have presence descriptions but few absence descriptions. 213 | 214 | For more details and the meaning of evaluating absence descriptions, please refer to Section 3.1 of the DOD paper. 215 | 216 | 217 | 218 | 219 | ## Evaluation Code and Examples 220 | 221 | In this part, we introduce how to evaluate the performance and obtain the metric values given a prediction result JSON file. 222 | 223 | ### Write a Snippet in Your Code 224 | 225 | This is based on [cocoapi (pycocotools)](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI), and is quite simple: 226 | 227 | ```python 228 | from pycocotools.coco import COCO 229 | from pycocotools.cocoeval import COCOeval 230 | 231 | # Eval results 232 | coco = COCO(gt_path) # `gt_path` is the ground-truth JSON path (different JSON for FULL, PRES or ABS settings in our paper) 233 | d3_model = coco.loadRes(pred_path) # `pred_path` is the prediction JSON file 234 | cocoEval = COCOeval(coco, d3_model, "bbox") 235 | cocoEval.evaluate() 236 | cocoEval.accumulate() 237 | cocoEval.summarize() 238 | ``` 239 | 240 | ### An Off-the-shelf Script 241 | 242 | We also provide [a script](scripts/eval_and_analysis_json.py) that can produce the evaluation results (and some additional analysis) in our paper, given a prediction JSON.
243 | You can use it by: 244 | ```shell 245 | python eval_and_analysis_json.py YOUR_PREDICTION_JSON_PATH 246 | ``` 247 | 248 | A few options are provided for format conversion or more analysis: 249 | ```shell 250 | python eval_and_analysis_json.py --help 251 | 252 | usage: An example script for $D^3$ evaluation with prediction file (JSON) [-h] [--partition-by-nbox] [--partition-by-lens] [--xyxy2xywh] pred_path 253 | 254 | positional arguments: 255 | pred_path path to prediction json 256 | 257 | optional arguments: 258 | -h, --help show this help message and exit 259 | --partition-by-nbox divide the images by num of boxes for each ref 260 | --partition-by-lens divide the references by their lengths 261 | --xyxy2xywh transform box coords from xyxy to xywh 262 | ``` 263 | 264 | 265 | ### Evaluation Examples on SOTA Methods 266 | 267 | See [this directory](eval_sota/) for details. We include the evaluation scripts of some methods there. 268 | 269 | 270 | 271 | ## Dataset Statistics 272 | 273 | [A python script](scripts/get_d3_stat.py) is provided for calculating the statistics of $D^3$ or visualizing figures like histograms, word clouds, etc. 274 | 275 | The specific statistics of the dataset are available in Section 3.3 of the DOD paper. 276 | -------------------------------------------------------------------------------- /eval_sota/README.md: -------------------------------------------------------------------------------- 1 | # Evaluting SOTA Methods on $D^3$ 2 | 3 | ## Leaderboard 4 | 5 | In this directory, we keep the scripts or github links (official or custom) to evaluate SOTA methods (REC/OVD/DOD/MLLM) on $D^3$: 6 | 7 | | Name | Paper | Original Tasks | Training Data | Evaluation Code | Intra-FULL/PRES/ABS/Inter-FULL/PRES/ABS | Source | Note | 8 | |:-----|:-----:|:----:|:-----:|:-----:|:-----:|:-----:|:-----:| 9 | | OFA-large | [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework (ICML 2022)](https://arxiv.org/abs/2202.03052) | REC | - | - | 4.2/4.1/4.6/0.1/0.1/0.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 10 | | CORA-R50 | [CORA: Adapting CLIP for Open-Vocabulary Detection with Region Prompting and Anchor Pre-Matching (CVPR 2023)](https://openaccess.thecvf.com/content/CVPR2023/papers/Wu_CORA_Adapting_CLIP_for_Open-Vocabulary_Detection_With_Region_Prompting_and_CVPR_2023_paper.pdf) | OVD | - | - | 6.2/6.7/5.0/2.0/2.2/1.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 11 | | OWL-ViT-large | [Simple Open-Vocabulary Object Detection with Vision Transformers (ECCV 2022)](https://www.ecva.net/papers/eccv_2022/papers_ECCV/papers/136700714.pdf) | OVD | - | [DOD official](./owl_vit.py) | 9.6/10.7/6.4/2.5/2.9/2.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper | 12 | | SPHINX-7B | [SPHINX: The Joint Mixing of Weights, Tasks, and Visual Embeddings for Multi-modal Large Language Models (arxiv 2023)](https://arxiv.org/abs/2311.07575) | **MLLM** capable of REC | - | [DOD official](./sphinx.py) | 10.6/11.4/7.9/-/-/- | DOD authors | A lot of contribution from [Jie Li](https://github.com/theFool32) | 13 | | GLIP-T | [Grounded Language-Image Pre-training (CVPR 2022)](https://arxiv.org/abs/2112.03857) | OVD & PG | - | - | 19.1/18.3/21.5/-/-/- | GEN paper | - | 14 | | UNINEXT-huge | [Universal Instance Perception as Object Discovery and Retrieval (CVPR 2023)](https://arxiv.org/abs/2303.06674v2) | OVD & REC | - | [DOD 
official](https://github.com/Charles-Xie/UNINEXT_D3) | 20.0/20.6/18.1/3.3/3.9/1.6 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 15 | | Grounding-DINO-base | [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection (arxiv 2023)](https://arxiv.org/abs/2303.05499) | OVD & REC | - | [DOD official](./groundingdino.py) | 20.7/20.1/22.5/2.7/2.4/3.5 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper | 16 | | OFA-DOD-base | [Described Object Detection: Liberating Object Detection with Flexible Expressions (NeurIPS 2023)](https://arxiv.org/abs/2307.12813) | DOD | - | - | 21.6/23.7/15.4/5.7/6.9/2.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - | 17 | | FIBER-B | [Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (NeurIPS 2022)](https://arxiv.org/abs/2206.07643) | OVD & REC | - | - | 22.7/21.5/26.0/-/-/- | GEN paper | - | 18 | | MM-Grounding-DINO | [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection (arxiv 2024)](https://arxiv.org/abs/2401.02361) | DOD & OVD & REC | O365, GoldG, GRIT, V3Det | [MM-GDINO official](https://github.com/open-mmlab/mmdetection/tree/main/configs/mm_grounding_dino#zero-shot-description-detection-datasetdod) | 22.9/21.9/26.0/-/-/- | MM-GDINO paper | - | 19 | | GEN (FIBER-B) | [Generating Enhanced Negatives for Training Language-Based Object Detectors (arxiv 2024](https://arxiv.org/abs/2401.00094) | DOD | - | - | 26.0/25.2/28.1/-/-/- | GEN paper | Enhancement based on FIBER-B | 20 | | APE-large (D) | [Aligning and Prompting Everything All at Once for Universal Visual Perception (arxiv 2023)](https://arxiv.org/abs/2312.02153) | DOD & OVD & REC | COCO, LVIS, O365, OpenImages, Visual Genome, RefCOCO/+/g, SA-1B, GQA, PhraseCut, Flickr30k | [APE official](https://github.com/shenyunhang/APE) | 37.5/38.8/33.9/21.0/22.0/17.9 | APE paper | Extra training data helps for this amazing performance | 21 | 22 | 23 | Some extra notes: 24 | - Each method is currently recorded by *the variant with the highest performance* in this table, if there are multiple variants available, so it's only a leaderboard, not meant for fair comparison. 25 | - Methods like GLIP, FIBER, etc. are actually not evaluated on OVD benchmarks. For zero-shot eval on DOD, We currently do not distinguish between methods for OVD benchmarks and methods for ZS-OD, as long as it is verified with open-set detection capability. 26 | 27 | For other variants (e.g. for a fair comparison regarding data, backbone, etc.), please refer to the papers. 
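If you want to benchmark a new method on $D^3$, the scripts in this directory all follow the same skeleton: iterate over images, fetch the intra-group descriptions, run the detector, dump COCO-format predictions to a JSON file, and score it with pycocotools. Below is a minimal sketch of that loop; `detect(...)`, `IMG_ROOT`, `PKL_ANNO_PATH` and `JSON_ANNO_PATH` are placeholders you must supply yourself, while the `d_cube` and `pycocotools` calls are the ones used by the scripts here:

```python
import json
import os

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from d_cube import D3

d3 = D3(IMG_ROOT, PKL_ANNO_PATH)  # image root and pkl annotation dir (placeholders)
pred = []
for img_id in d3.get_img_ids():
    img_info = d3.load_imgs(img_id)[0]
    img_path = os.path.join(IMG_ROOT, img_info["file_name"])
    group_ids = d3.get_group_ids(img_ids=[img_id])
    sent_ids = d3.get_sent_ids(group_ids=group_ids)  # intra-group setting
    texts = [s["raw_sent"] for s in d3.load_sents(sent_ids=sent_ids)]
    # `detect` is a placeholder for your model: boxes in [x, y, w, h], labels index into `texts`
    boxes, scores, labels = detect(img_path, texts)
    for box, score, label in zip(boxes, scores, labels):
        pred.append({"image_id": img_id, "category_id": sent_ids[label],
                     "bbox": box, "score": float(score)})

with open("prediction.json", "w") as f:
    json.dump(pred, f)

coco = COCO(os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json"))  # FULL setting
coco_eval = COCOeval(coco, coco.loadRes("prediction.json"), "bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
```

Swapping `d3.get_sent_ids(group_ids=group_ids)` for `d3.get_sent_ids()` gives the inter-group numbers, and pointing `COCO(...)` at the PRES/ABS ground-truth JSONs gives the other two columns.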
28 | -------------------------------------------------------------------------------- /eval_sota/groundingdino.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie" 3 | __maintainer__ = "Chi Xie" 4 | 5 | # An example for how to run this script: 6 | # CUDA_VISIBLE_DEVICES=0 7 | # python groundingdino.py \ 8 | # -c ./groundingdino/config/GroundingDINO_SwinB.cfg.py \ 9 | # -p ./ckpt/groundingdino_swinb_cogcoor.pth \ 10 | # -o "outputs/gdino_d3" \ 11 | # --box_threshold 0.05 \ 12 | # --text_threshold 0.05 \ 13 | # --img-top1 14 | 15 | import argparse 16 | import json 17 | import os 18 | 19 | import numpy as np 20 | import torch 21 | from PIL import Image, ImageDraw, ImageFont 22 | from pycocotools.coco import COCO 23 | from pycocotools.cocoeval import COCOeval 24 | from tqdm import tqdm 25 | 26 | import groundingdino.datasets.transforms as T 27 | from groundingdino.models import build_model 28 | from groundingdino.util.slconfig import SLConfig 29 | from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap 30 | from d_cube import D3 31 | 32 | 33 | def plot_boxes_to_image(image_pil, tgt): 34 | H, W = tgt["size"] 35 | boxes = tgt["boxes"] 36 | labels = tgt["labels"] 37 | assert len(boxes) == len(labels), "boxes and labels must have same length" 38 | 39 | draw = ImageDraw.Draw(image_pil) 40 | mask = Image.new("L", image_pil.size, 0) 41 | mask_draw = ImageDraw.Draw(mask) 42 | 43 | # draw boxes and masks 44 | for box, label in zip(boxes, labels): 45 | # from 0..1 to 0..W, 0..H 46 | box = box * torch.Tensor([W, H, W, H]) 47 | # from xywh to xyxy 48 | box[:2] -= box[2:] / 2 49 | box[2:] += box[:2] 50 | # random color 51 | color = tuple(np.random.randint(0, 255, size=3).tolist()) 52 | # draw 53 | x0, y0, x1, y1 = box 54 | x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) 55 | 56 | draw.rectangle([x0, y0, x1, y1], outline=color, width=6) 57 | # draw.text((x0, y0), str(label), fill=color) 58 | 59 | font = ImageFont.load_default() 60 | if hasattr(font, "getbbox"): 61 | bbox = draw.textbbox((x0, y0), str(label), font) 62 | else: 63 | w, h = draw.textsize(str(label), font) 64 | bbox = (x0, y0, w + x0, y0 + h) 65 | # bbox = draw.textbbox((x0, y0), str(label)) 66 | draw.rectangle(bbox, fill=color) 67 | draw.text((x0, y0), str(label), fill="white") 68 | 69 | mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6) 70 | return image_pil, mask 71 | 72 | 73 | def load_image(image_path): 74 | # load image 75 | image_pil = Image.open(image_path).convert("RGB") # load image 76 | 77 | transform = T.Compose( 78 | [ 79 | T.RandomResize([800], max_size=1333), 80 | T.ToTensor(), 81 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 82 | ] 83 | ) 84 | image, _ = transform(image_pil, None) # 3, h, w 85 | return image_pil, image 86 | 87 | 88 | def load_model(model_config_path, model_checkpoint_path, cpu_only=False): 89 | args = SLConfig.fromfile(model_config_path) 90 | args.device = "cuda" if not cpu_only else "cpu" 91 | model = build_model(args) 92 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu") 93 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 94 | print(load_res) 95 | _ = model.eval() 96 | return model 97 | 98 | 99 | def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, cpu_only=False): 100 | caption = caption.lower() 101 | caption = caption.strip() 102 | if not caption.endswith("."): 103 | caption = 
caption + "." 104 | device = "cuda" if not cpu_only else "cpu" 105 | model = model.to(device) 106 | image = image.to(device) 107 | with torch.no_grad(): 108 | outputs = model(image[None], captions=[caption]) 109 | logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256) 110 | boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4) 111 | logits.shape[0] 112 | 113 | # filter output 114 | logits_filt = logits.clone() 115 | boxes_filt = boxes.clone() 116 | filt_mask = logits_filt.max(dim=1)[0] > box_threshold 117 | logits_filt = logits_filt[filt_mask] # num_filt, 256 118 | boxes_filt = boxes_filt[filt_mask] # num_filt, 4 119 | logits_filt.shape[0] 120 | 121 | # get phrase 122 | tokenlizer = model.tokenizer 123 | tokenized = tokenlizer(caption) 124 | # build pred 125 | pred_phrases = [] 126 | logits_list = [] 127 | for logit, box in zip(logits_filt, boxes_filt): 128 | pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer) 129 | logits_list.append(logit.max().item()) 130 | if with_logits: 131 | pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") 132 | else: 133 | pred_phrases.append(pred_phrase) 134 | 135 | return boxes_filt, pred_phrases, logits_list 136 | 137 | 138 | def get_dataset_iter(coco): 139 | img_ids = coco.get_img_ids() 140 | for img_id in img_ids: 141 | img_info = coco.load_imgs(img_id)[0] 142 | file_name = img_info["file_name"] 143 | img_path = os.path.join(IMG_ROOT, file_name) 144 | yield img_id, img_path 145 | 146 | 147 | def eval_on_d3(pred_path, mode="pn"): 148 | assert mode in ("pn", "p", "n") 149 | if mode == "pn": 150 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 151 | elif mode == "p": 152 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 153 | else: 154 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 155 | coco = COCO(gt_path) 156 | d3_res = coco.loadRes(pred_path) 157 | cocoEval = COCOeval(coco, d3_res, "bbox") 158 | cocoEval.evaluate() 159 | cocoEval.accumulate() 160 | cocoEval.summarize() 161 | 162 | # comment the following if u only need intra/inter map for full/pres/abs 163 | # ===================== uncomment this if u need detailed analysis ===================== 164 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1] 165 | # category_ids = coco.getCatIds() 166 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 167 | 168 | # aps_lens = defaultdict(list) 169 | # counter_lens = defaultdict(int) 170 | # for i in range(len(category_names)): 171 | # ap = aps[:, :, i] 172 | # ap_value = ap[ap > -1].mean() 173 | # if not np.isnan(ap_value): 174 | # len_ref = len(category_names[i].split(" ")) 175 | # aps_lens[len_ref].append(ap_value) 176 | # counter_lens[len_ref] += 1 177 | 178 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 179 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 180 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 181 | # ap_sum_very_long = sum( 182 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 183 | # ) 184 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 185 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 186 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 187 | # c_sum_very_long = sum( 188 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 189 | # ) 190 | # map_short = ap_sum_short / c_sum_short 191 | # map_mid = ap_sum_mid / c_sum_mid 192 | # map_long = ap_sum_long / c_sum_long 193 | # map_very_long 
= ap_sum_very_long / c_sum_very_long 194 | # print( 195 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 196 | # ) 197 | # ===================== uncomment this if u need detailed analysis ===================== 198 | 199 | 200 | def inference_on_d3(data_iter, model, args, box_threshold, text_threshold): 201 | pred = [] 202 | for idx, (img_id, image_path) in enumerate(tqdm(data_iter)): 203 | # load image 204 | image_pil, image = load_image(image_path) 205 | size = image_pil.size 206 | W, H = size 207 | 208 | group_ids = d3.get_group_ids(img_ids=[img_id]) 209 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 210 | sent_list = d3.load_sents(sent_ids=sent_ids) 211 | text_list = [sent['raw_sent'] for sent in sent_list] 212 | 213 | for sent_id, text_prompt in zip(sent_ids, text_list): 214 | # run model 215 | boxes_filt, pred_phrases, logit_list = get_grounding_output( 216 | model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, with_logits=False, 217 | ) 218 | if args.vis: 219 | pred_dict = { 220 | "boxes": boxes_filt, # [x_center, y_center, w, h] 221 | "size": [size[1], size[0]], 222 | "labels": [f"{phrase}({str(logit)[:4]})" for phrase, logit in zip(pred_phrases, logit_list)], 223 | } 224 | image_with_box = plot_boxes_to_image(image_pil.copy(), pred_dict)[0] 225 | image_with_box.save(os.path.join(output_dir, f"{img_id}_{text_prompt}.jpg")) 226 | if not logit_list: 227 | continue 228 | if args.img_top1: 229 | max_score_idx = logit_list.index(max(logit_list)) 230 | bboxes, phrases, logits = [boxes_filt[max_score_idx]], [pred_phrases[max_score_idx]], [logit_list[max_score_idx]] 231 | else: 232 | bboxes, phrases, logits = boxes_filt, pred_phrases, logit_list 233 | for box, phrase, logit in zip(bboxes, phrases, logits): 234 | if len(phrase) > args.overlap_percent * len(text_prompt) or phrase == text_prompt: 235 | x1, y1, w, h = box.tolist() 236 | x0, y0 = x1 - w / 2, y1 - h / 2 237 | pred_item = { 238 | "image_id": img_id, 239 | "category_id": sent_id, 240 | "bbox": [x0 * W, y0 * H, w * W, h * H], 241 | "score": float(logit), 242 | } 243 | pred.append(pred_item) 244 | 245 | return pred 246 | 247 | 248 | if __name__ == "__main__": 249 | IMG_ROOT = None # set here 250 | JSON_ANNO_PATH = None # set here 251 | PKL_ANNO_PATH = None # set here 252 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 253 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 254 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 255 | 256 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 257 | 258 | parser = argparse.ArgumentParser("Grounding DINO evaluation on D-cube (https://arxiv.org/abs/2307.12813)", add_help=True) 259 | parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file") 260 | parser.add_argument( 261 | "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file" 262 | ) 263 | # parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file") 264 | # parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt") 265 | parser.add_argument( 266 | "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory" 267 | ) 268 | parser.add_argument("--vis", action="store_true", help="visualization on D3") 269 | 270 | parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold") 271 
| parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold") 272 | 273 | parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False") 274 | parser.add_argument("--img-top1", action="store_true", help="select only the box with top max score") 275 | # parser.add_argument("--overlap-percent", type=float, default=1.0, help="overlapping percentage between input prompt and output label") 276 | # this overlapping percentage denotes an additional post-processing technique we designed. if you turn this on, you may get higher performance by tuning this parameter. 277 | args = parser.parse_args() 278 | args.overlap_percent = 1 # by default, we do not use this technique. 279 | print(args) 280 | 281 | # cfg 282 | config_file = args.config_file # change the path of the model config file 283 | checkpoint_path = args.checkpoint_path # change the path of the model 284 | # image_path = args.image_path 285 | # text_prompt = args.text_prompt 286 | output_dir = args.output_dir 287 | box_threshold = args.box_threshold 288 | text_threshold = args.text_threshold 289 | 290 | # make dir 291 | os.makedirs(output_dir, exist_ok=True) 292 | # load model 293 | model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only) 294 | 295 | data_iter = get_dataset_iter(d3) 296 | 297 | pred = inference_on_d3(data_iter, model, args, box_threshold=box_threshold, text_threshold=text_threshold) 298 | 299 | pred_path = os.path.join(output_dir, f"prediction.json") 300 | with open(pred_path, "w") as f_: 301 | json.dump(pred, f_) 302 | eval_on_d3(pred_path, mode='pn') 303 | eval_on_d3(pred_path, mode='p') 304 | eval_on_d3(pred_path, mode='n') 305 | -------------------------------------------------------------------------------- /eval_sota/owl_vit.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | from tqdm import tqdm 6 | from PIL import Image 7 | import numpy as np 8 | from pycocotools.coco import COCO 9 | from pycocotools.cocoeval import COCOeval 10 | import torch 11 | from transformers import OwlViTProcessor, OwlViTForObjectDetection 12 | 13 | from d_cube import D3 14 | 15 | 16 | def write_json(json_path, json_data): 17 | with open(json_path, "w") as f_: 18 | json.dump(json_data, f_) 19 | 20 | 21 | def read_json(json_path): 22 | with open(json_path, "r") as f_: 23 | json_data = json.load(f_) 24 | return json_data 25 | 26 | 27 | def load_image_general(image_path): 28 | image_pil = Image.open(image_path) 29 | return image_pil 30 | 31 | 32 | def get_prediction(model, image, captions, cpu_only=False): 33 | for i in range(len(captions)): 34 | captions[i] = captions[i].lower() 35 | captions[i] = captions[i].strip() 36 | if not captions[i].endswith("."): 37 | captions[i] = captions[i] + "." 
38 | device = "cuda" if not cpu_only else "cpu" 39 | model = model.to(device) 40 | with torch.no_grad(): 41 | inputs = processor(text=[captions], images=image, return_tensors="pt").to( 42 | device 43 | ) 44 | outputs = model(**inputs) 45 | target_size = torch.Tensor([image.size[::-1]]).to(device) 46 | results = processor.post_process_object_detection( 47 | outputs=outputs, target_sizes=target_size, threshold=0.1 48 | # the post precessing threshold will affect the performance obviously 49 | # you may tune it to get better performance, e.g., 0.05 50 | ) 51 | boxes, scores, labels = ( 52 | results[0]["boxes"], 53 | results[0]["scores"], 54 | results[0]["labels"], 55 | ) 56 | return boxes, scores, labels 57 | 58 | 59 | def get_dataset_iter(coco): 60 | img_ids = coco.get_img_ids() 61 | for img_id in img_ids: 62 | img_info = coco.load_imgs(img_id)[0] 63 | file_name = img_info["file_name"] 64 | img_path = os.path.join(IMG_ROOT, file_name) 65 | yield img_id, img_path 66 | 67 | 68 | def eval_on_d3(pred_path, mode="pn"): 69 | assert mode in ("pn", "p", "n") 70 | if mode == "pn": 71 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 72 | elif mode == "p": 73 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 74 | else: 75 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 76 | coco = COCO(gt_path) 77 | d3_res = coco.loadRes(pred_path) 78 | cocoEval = COCOeval(coco, d3_res, "bbox") 79 | cocoEval.evaluate() 80 | cocoEval.accumulate() 81 | cocoEval.summarize() 82 | 83 | # comment the following if u only need intra/inter map for full/pres/abs 84 | # ===================== uncomment this if u need detailed analysis ===================== 85 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1] 86 | # category_ids = coco.getCatIds() 87 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 88 | 89 | # aps_lens = defaultdict(list) 90 | # counter_lens = defaultdict(int) 91 | # for i in range(len(category_names)): 92 | # ap = aps[:, :, i] 93 | # ap_value = ap[ap > -1].mean() 94 | # if not np.isnan(ap_value): 95 | # len_ref = len(category_names[i].split(" ")) 96 | # aps_lens[len_ref].append(ap_value) 97 | # counter_lens[len_ref] += 1 98 | 99 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 100 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 101 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 102 | # ap_sum_very_long = sum( 103 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 104 | # ) 105 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 106 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 107 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 108 | # c_sum_very_long = sum( 109 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 110 | # ) 111 | # map_short = ap_sum_short / c_sum_short 112 | # map_mid = ap_sum_mid / c_sum_mid 113 | # map_long = ap_sum_long / c_sum_long 114 | # map_very_long = ap_sum_very_long / c_sum_very_long 115 | # print( 116 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 117 | # ) 118 | # ===================== uncomment this if u need detailed analysis ===================== 119 | 120 | 121 | def inference_on_d3(data_iter, model): 122 | pred = [] 123 | error = [] 124 | for img_id, image_path in tqdm(data_iter): 125 | image = load_image_general(image_path) 126 | 127 | # ==================================== intra-group 
setting ==================================== 128 | # each image is evaluated with the categories in its group (usually 4) 129 | group_ids = d3.get_group_ids(img_ids=[img_id]) 130 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 131 | # ==================================== intra-group setting ==================================== 132 | # ==================================== inter-group setting ==================================== 133 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset) 134 | # sent_ids = d3.get_sent_ids() 135 | # ==================================== inter-group setting ==================================== 136 | sent_list = d3.load_sents(sent_ids=sent_ids) 137 | text_list = [sent["raw_sent"] for sent in sent_list] 138 | 139 | try: 140 | boxes, scores, labels = get_prediction(model, image, text_list, cpu_only=False) 141 | for box, score, label in zip(boxes, scores, labels): 142 | pred_item = { 143 | "image_id": img_id, 144 | "category_id": sent_ids[label], 145 | "bbox": convert_to_xywh(box.tolist()), # use xywh 146 | "score": float(score), 147 | } 148 | pred.append(pred_item) # the output to be saved to JSON. 149 | except: 150 | print("error!!!") 151 | return pred, error 152 | 153 | 154 | def convert_to_xywh(bbox_xyxy): 155 | """ 156 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format. 157 | """ 158 | x1, y1, x2, y2 = bbox_xyxy 159 | width = x2 - x1 160 | height = y2 - y1 161 | return [x1, y1, width, height] 162 | 163 | 164 | if __name__ == "__main__": 165 | IMG_ROOT = None # set here 166 | JSON_ANNO_PATH = None # set here 167 | PKL_ANNO_PATH = None # set here 168 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 169 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 170 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 171 | 172 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 173 | 174 | output_dir = "ovd/owlvit/" 175 | os.makedirs(output_dir, exist_ok=True) 176 | 177 | # model prediction 178 | processor = OwlViTProcessor.from_pretrained("owl-vit") 179 | model = OwlViTForObjectDetection.from_pretrained("owl-vit") 180 | data_iter = get_dataset_iter(d3) 181 | pred, error = inference_on_d3(data_iter, model) 182 | 183 | pred_path = os.path.join(output_dir, f"prediction.json") 184 | pred_path_error = os.path.join(output_dir, "error.json") 185 | write_json(pred_path, pred) 186 | write_json(pred_path_error, error) 187 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format 188 | # the output format is identical to COCO. 
189 | 190 | eval_on_d3(pred_path, mode="pn") # the FULL setting 191 | eval_on_d3(pred_path, mode="p") # the PRES setting 192 | eval_on_d3(pred_path, mode="n") # the ABS setting 193 | -------------------------------------------------------------------------------- /eval_sota/sphinx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Jie Li" 3 | __maintainer__ = "Chi Xie" 4 | 5 | import json 6 | import os 7 | from collections import defaultdict 8 | import re 9 | 10 | from PIL import Image 11 | from pycocotools.coco import COCO 12 | from pycocotools.cocoeval import COCOeval 13 | 14 | from d_cube import D3 15 | 16 | 17 | def write_json(json_path, json_data): 18 | with open(json_path, "w") as f_: 19 | json.dump(json_data, f_) 20 | 21 | 22 | def read_json(json_path): 23 | with open(json_path, "r") as f_: 24 | json_data = json.load(f_) 25 | return json_data 26 | 27 | 28 | def load_image_general(image_path): 29 | image_pil = Image.open(image_path) 30 | return image_pil 31 | 32 | 33 | def extract_boxes(input_string): 34 | # if input_string.startswith("None"): 35 | # return [] 36 | # Define the pattern using regular expression 37 | pattern = r'\[([\d.,; ]+)\]' 38 | 39 | # Search for the pattern in the input string 40 | match = re.search(pattern, input_string) 41 | 42 | # If a match is found, extract and return the boxes as a list 43 | if match: 44 | boxes_str = match.group(1) 45 | boxes_list = [list(map(float, box.split(','))) for box in boxes_str.split(';')] 46 | return boxes_list 47 | else: 48 | return [] 49 | 50 | 51 | def get_prediction(mllm_res, image, captions, cpu_only=False): 52 | boxes, scores, labels = [], [], [] 53 | width, height = image.size 54 | for idx, res_item in enumerate(mllm_res): 55 | boxes_list = extract_boxes(res_item["answer"]) 56 | for bbox in boxes_list: 57 | bbox_rescaled = get_true_bbox(image.size, bbox) 58 | boxes.append(bbox_rescaled) 59 | scores.append(1.0) 60 | labels.append(idx) 61 | return boxes, scores, labels 62 | 63 | 64 | def get_dataset_iter(coco): 65 | img_ids = coco.get_img_ids() 66 | for img_id in img_ids: 67 | img_info = coco.load_imgs(img_id)[0] 68 | file_name = img_info["file_name"] 69 | img_path = os.path.join(IMG_ROOT, file_name) 70 | yield img_id, file_name, img_path 71 | 72 | 73 | def eval_on_d3(pred_path, mode="pn"): 74 | assert mode in ("pn", "p", "n") 75 | if mode == "pn": 76 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 77 | elif mode == "p": 78 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 79 | else: 80 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 81 | coco = COCO(gt_path) 82 | d3_res = coco.loadRes(pred_path) 83 | cocoEval = COCOeval(coco, d3_res, "bbox") 84 | cocoEval.evaluate() 85 | cocoEval.accumulate() 86 | cocoEval.summarize() 87 | 88 | 89 | def group_sphinx_res_by_img(inference_res): 90 | inference_res_by_img = defaultdict(list) 91 | for res_item in inference_res: 92 | img_path = "/".join(res_item["image_path"].split("/")[-2:]) 93 | inference_res_by_img[img_path].append(res_item) 94 | inference_res_by_img = dict(inference_res_by_img) 95 | return inference_res_by_img 96 | 97 | 98 | def get_true_bbox(img_size, bbox): 99 | width, height = img_size 100 | max_edge = max(height, width) 101 | bbox = [v * max_edge for v in bbox] 102 | diff = abs(width - height) // 2 103 | if height < width: 104 | bbox[1] -= diff 105 | bbox[3] -= diff 106 | else: 107 | bbox[0] -= diff 108 | bbox[2] -= diff 
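# the scaling/offset above assumes the SPHINX boxes are normalized w.r.t. a square canvas of side max(width, height);
# subtracting `diff` shifts the coordinates from that padded frame back to the original image frame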
109 | return bbox 110 | 111 | 112 | def inference_on_d3(data_iter, inference_res): 113 | pred = [] 114 | inf_res_by_img = group_sphinx_res_by_img(inference_res) 115 | for idx, (img_id, img_name, img_path) in enumerate(data_iter): 116 | image = load_image_general(img_path) 117 | 118 | # ==================================== intra-group setting ==================================== 119 | # each image is evaluated with the categories in its group (usually 4) 120 | group_ids = d3.get_group_ids(img_ids=[img_id]) 121 | sent_ids = d3.get_sent_ids(group_ids=group_ids) 122 | # ==================================== intra-group setting ==================================== 123 | # ==================================== inter-group setting ==================================== 124 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset) 125 | # sent_ids = d3.get_sent_ids() 126 | # ==================================== inter-group setting ==================================== 127 | sent_list = d3.load_sents(sent_ids=sent_ids) 128 | text_list = [sent["raw_sent"] for sent in sent_list] 129 | 130 | boxes, scores, labels = get_prediction(inf_res_by_img[img_name], image, text_list, cpu_only=False) 131 | for box, score, label in zip(boxes, scores, labels): 132 | pred_item = { 133 | "image_id": img_id, 134 | "category_id": sent_ids[label], 135 | "bbox": convert_to_xywh(box), # use xywh 136 | "score": float(score), 137 | } 138 | pred.append(pred_item) # the output to be saved to JSON. 139 | return pred 140 | 141 | 142 | def convert_to_xywh(bbox_xyxy): 143 | """ 144 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format. 145 | """ 146 | x1, y1, x2, y2 = bbox_xyxy 147 | width = x2 - x1 148 | height = y2 - y1 149 | return [x1, y1, width, height] 150 | 151 | 152 | if __name__ == "__main__": 153 | IMG_ROOT = None # set here 154 | JSON_ANNO_PATH = None # set here 155 | PKL_ANNO_PATH = None # set here 156 | # ============================== SPHINX inference result file =============== 157 | SPHINX_INFERENCE_RES_PATH = None 158 | # You can download the SPHINX d3 inference result example from: 159 | # https://github.com/shikras/d-cube/files/14276682/sphinx_d3_result.json 160 | # For the inference process, please refer to SPHINX official repo (https://github.com/Alpha-VLLM/LLaMA2-Accessory) 161 | # the prompts we used are available in this JSON file 162 | # Thanks for the contribution from Jie Li (https://github.com/theFool32) 163 | # ============================== SPHINX inference result file =============== 164 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 165 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 166 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 167 | 168 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 169 | 170 | output_dir = "mllm/sphinx/" # or whatever you prefer 171 | inference_res = read_json(SPHINX_INFERENCE_RES_PATH) 172 | 173 | # model prediction 174 | data_iter = get_dataset_iter(d3) 175 | pred = inference_on_d3(data_iter, inference_res) 176 | 177 | pred_path = os.path.join(output_dir, f"prediction.json") 178 | write_json(pred_path, pred) 179 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format 180 | # the output format is identical to COCO. 
181 | 182 | eval_on_d3(pred_path, mode="pn") # the FULL setting 183 | eval_on_d3(pred_path, mode="p") # the PRES setting 184 | eval_on_d3(pred_path, mode="n") # the ABS setting 185 | -------------------------------------------------------------------------------- /qa.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | Q: 4 | What's the difference between the Intra-Group and Inter-Group settings in [the DOD paper](https://arxiv.org/abs/2307.12813), and how do I set them? 5 | 6 | A: 7 | Please see [this explanation in the document](./doc.md#intra--or-inter-group-settings). 8 | 9 | 10 | 11 | Q: 12 | What do FULL, PRES, and ABS mean, and how do they differ? 13 | 14 | A: 15 | Please see [this explanation in the document](./doc.md#full-pres-and-abs). 16 | 17 | 18 | 19 | Q: 20 | How do I visualize the ground truth or predictions on an image? 21 | 22 | A: 23 | You can use the `d3.get_anno_ids` function and pass the `img_id` you choose as a parameter to get the annotation ids for an image. 24 | After this, you can obtain the annotation details (class ids, bboxes) with `d3.load_annos`. 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pycocotools 3 | opencv-python 4 | matplotlib 5 | -------------------------------------------------------------------------------- /scripts/eval_and_analysis_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this script takes a result JSON file as input, and prints evaluation and analysis results on D-cube (FULL/PRES/ABS, etc.)
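# Example invocations (a sketch; the prediction path below is a placeholder):
#   python scripts/eval_and_analysis_json.py /path/to/pred.json                      # FULL/PRES/ABS evaluation
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --xyxy2xywh          # convert boxes from xyxy to xywh first
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --partition-by-nbox  # break down by number of boxes per reference
#   python scripts/eval_and_analysis_json.py /path/to/pred.json --partition-by-lens  # break down by reference length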
5 | import os 6 | import json 7 | import argparse 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | from pycocotools.coco import COCO 12 | from pycocotools.cocoeval import COCOeval 13 | 14 | from d_cube import D3 15 | 16 | def eval_on_d3(pred_path, mode="pn", nbox_partition=None, lref_partition=False): 17 | assert mode in ("pn", "p", "n") 18 | if mode == "pn": 19 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json") 20 | elif mode == "p": 21 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json") 22 | else: 23 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json") 24 | 25 | if nbox_partition: 26 | gt_path, pred_path = nbox_partition_json(gt_path, pred_path, nbox_partition) 27 | 28 | # Eval results 29 | coco = COCO(gt_path) 30 | d3_res = coco.loadRes(pred_path) 31 | cocoEval = COCOeval(coco, d3_res, "bbox") 32 | cocoEval.evaluate() 33 | cocoEval.accumulate() 34 | cocoEval.summarize() 35 | 36 | aps = cocoEval.eval["precision"][:, :, :, 0, -1] 37 | category_ids = coco.getCatIds() 38 | category_names = [cat["name"] for cat in coco.loadCats(category_ids)] 39 | 40 | if lref_partition: 41 | aps_lens = defaultdict(list) 42 | counter_lens = defaultdict(int) 43 | for i in range(len(category_names)): 44 | ap = aps[:, :, i] 45 | ap_value = ap[ap > -1].mean() 46 | if not np.isnan(ap_value): 47 | len_ref = len(category_names[i].split(" ")) 48 | aps_lens[len_ref].append(ap_value) 49 | counter_lens[len_ref] += 1 50 | 51 | ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) 52 | ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) 53 | ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) 54 | ap_sum_very_long = sum( 55 | [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)] 56 | ) 57 | c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) 58 | c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) 59 | c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) 60 | c_sum_very_long = sum( 61 | [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)] 62 | ) 63 | map_short = ap_sum_short / c_sum_short 64 | map_mid = ap_sum_mid / c_sum_mid 65 | map_long = ap_sum_long / c_sum_long 66 | map_very_long = ap_sum_very_long / c_sum_very_long 67 | print( 68 | f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}" 69 | ) 70 | 71 | 72 | def nbox_partition_json(gt_path, pred_path, nbox_partition): 73 | with open(gt_path, "r") as f_gt: 74 | gts = json.load(f_gt) 75 | with open(pred_path, "r") as f_pred: 76 | preds = json.load(f_pred) 77 | 78 | cat_obj_count = d3.bbox_num_analyze() 79 | annos = gts["annotations"] 80 | new_annos = [] 81 | for ann in annos: 82 | img_id = ann["image_id"] 83 | category_id = ann["category_id"] 84 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1: 85 | new_annos.append(ann) 86 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1: 87 | new_annos.append(ann) 88 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2: 89 | new_annos.append(ann) 90 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3: 91 | new_annos.append(ann) 92 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4: 93 | new_annos.append(ann) 94 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4: 95 | new_annos.append(ann) 96 | gts["annotations"] = new_annos 97 | new_gts = gts 98 | new_preds = 
[] 99 | for prd in preds: 100 | img_id = prd["image_id"] 101 | category_id = prd["category_id"] 102 | if nbox_partition == "no" and cat_obj_count[category_id - 1, img_id] == 0: 103 | new_preds.append(prd) 104 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1: 105 | new_preds.append(prd) 106 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1: 107 | new_preds.append(prd) 108 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2: 109 | new_preds.append(prd) 110 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3: 111 | new_preds.append(prd) 112 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4: 113 | new_preds.append(prd) 114 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4: 115 | new_preds.append(prd) 116 | 117 | new_gt_path = gt_path.replace(".json", f".{nbox_partition}-instance.json") 118 | new_pred_path = pred_path.replace(".json", f".{nbox_partition}-instance.json") 119 | with open(new_gt_path, "w") as fo_gt: 120 | json.dump(new_gts, fo_gt) 121 | with open(new_pred_path, "w") as fo_pred: 122 | json.dump(new_preds, fo_pred) 123 | return new_gt_path, new_pred_path 124 | 125 | 126 | def convert_to_xywh(x1, y1, x2, y2): 127 | """ 128 | Convert top-left and bottom-right corner coordinates to [x,y,width,height] format. 129 | """ 130 | width = x2 - x1 131 | height = y2 - y1 132 | return x1, y1, width, height 133 | 134 | 135 | def transform_json_boxes(pred_path): 136 | with open(pred_path, "r") as f_: 137 | res = json.load(f_) 138 | for item in res: 139 | item["bbox"] = convert_to_xywh(*item["bbox"]) 140 | res_path = pred_path.replace(".json", ".xywh.json") 141 | with open(res_path, "w") as f_w: 142 | json.dump(res, f_w) 143 | return res_path 144 | 145 | 146 | if __name__ == "__main__": 147 | IMG_ROOT = None # set here 148 | JSON_ANNO_PATH = None # set here 149 | PKL_ANNO_PATH = None # set here 150 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 151 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first" 152 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 153 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 154 | 155 | parser = argparse.ArgumentParser( 156 | "An example script for D-cube evaluation with prediction file (JSON)", 157 | add_help=True, 158 | ) 159 | parser.add_argument("pred_path", type=str, help="path to the prediction JSON file") 160 | parser.add_argument( 161 | "--partition-by-nbox", 162 | action="store_true", 163 | help="divide the images by num of boxes for each ref", 164 | ) 165 | parser.add_argument( 166 | "--partition-by-lens", 167 | action="store_true", 168 | help="divide the references by their lengths", 169 | ) 170 | parser.add_argument( 171 | "--xyxy2xywh", 172 | action="store_true", 173 | help="transform box coords from xyxy to xywh", 174 | ) 175 | args = parser.parse_args() 176 | if args.xyxy2xywh: 177 | # convert predicted boxes from xyxy to xywh before evaluation 178 | pred_path = transform_json_boxes(args.pred_path) 179 | else: 180 | pred_path = args.pred_path 181 | if args.partition_by_nbox: 182 | # partition: no-instance, one-instance, multi-instance, etc. 183 | for mode in ("pn", "p", "n"): 184 | # for ptt in ('no', 'one', 'multi'): 185 | for ptt in ("no", "one", "two", "three", "four", "four_more"): 186 | eval_on_d3(pred_path, mode=mode, nbox_partition=ptt) 187 | else: 188 | eval_on_d3(pred_path, mode="pn", lref_partition=args.partition_by_lens) 189 |
eval_on_d3(pred_path, mode="p", lref_partition=args.partition_by_lens) 190 | eval_on_d3(pred_path, mode="n", lref_partition=args.partition_by_lens) 191 | -------------------------------------------------------------------------------- /scripts/eval_json_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Chi Xie and Zhao Zhang" 3 | __maintainer__ = "Chi Xie" 4 | # this script takes the result json in, and print evaluation and analysis result on D-cube (FULL/PRES/ABS, etc.) 5 | from pycocotools.coco import COCO 6 | from pycocotools.cocoeval import COCOeval 7 | 8 | # Eval results with COCOAPI 9 | gt_path = "./d3_full_annotations.json" # FULL, PRES or ABS 10 | pred_path = None # set your prediction JSON path 11 | coco = COCO(gt_path) 12 | d3_res = coco.loadRes(pred_path) 13 | cocoEval = COCOeval(coco, d3_res, "bbox") 14 | cocoEval.evaluate() 15 | cocoEval.accumulate() 16 | cocoEval.summarize() 17 | -------------------------------------------------------------------------------- /scripts/get_d3_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from d_cube.vis_util import plot_hist 4 | from d_cube import D3 5 | 6 | 7 | def vis_num_instance(cat_obj_count): 8 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img] 9 | 10 | # Calculate the total number of instances in each image 11 | total_instances_per_image = np.sum(cat_obj_count, axis=0) 12 | 13 | # # Plot the histogram 14 | # plt.hist(total_instances_per_image, bins=20) 15 | # plt.xlabel('Number of Instances') 16 | # plt.ylabel('Frequency') 17 | # plt.title('Distribution of Number of Instances on a Image') 18 | 19 | # # Save the figure 20 | # plt.savefig('vis_fig/instance_distribution.png', bbox_inches='tight') 21 | # plt.close() 22 | plot_hist( 23 | total_instances_per_image, 24 | bins=max(total_instances_per_image) - min(total_instances_per_image) + 1, 25 | save_path="vis_fig/instance_dist_hist.pdf", 26 | ) 27 | 28 | 29 | def vis_num_category(cat_obj_count): 30 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img] 31 | 32 | # Calculate the number of categories in each image 33 | num_categories_per_image = np.sum(cat_obj_count > 0, axis=0) 34 | 35 | # # Plot the histogram 36 | # plt.hist(num_categories_per_image, bins=20) 37 | # plt.xlabel('Number of Categories') 38 | # plt.ylabel('Frequency') 39 | # plt.title('Distribution of Number of Categories on a Image') 40 | 41 | # # Save the figure 42 | # plt.savefig('vis_fig/category_distribution.png', bbox_inches='tight') 43 | # plt.close() 44 | plot_hist( 45 | num_categories_per_image, 46 | bins=max(num_categories_per_image) - min(num_categories_per_image) + 1, 47 | save_path="vis_fig/category_dist_hist.pdf", 48 | ) 49 | 50 | 51 | def vis_num_img_per_cat(cat_obj_count): 52 | num_img_per_cat = np.sum(cat_obj_count > 0, axis=1) 53 | plot_hist( 54 | num_img_per_cat, 55 | bins=20, 56 | save_path="vis_fig/nimg_pcat_hist.pdf", 57 | x="Num. of images", 58 | ) 59 | 60 | 61 | def vis_num_box_per_cat(cat_obj_count): 62 | num_box_per_cat = np.sum(cat_obj_count, axis=1) 63 | plot_hist( 64 | num_box_per_cat, 65 | bins=20, 66 | save_path="vis_fig/nbox_pcat_hist.pdf", 67 | x="Num. 
of instances", 68 | ) 69 | 70 | 71 | def vis_num_box_per_cat_per_img(cat_obj_count): 72 | img_obj_count = cat_obj_count.reshape(-1) 73 | plot_hist( 74 | img_obj_count[img_obj_count > 0], 75 | bins=max(img_obj_count) - min(img_obj_count) + 1, 76 | save_path="vis_fig/nbox_pcat_pimg_hist.pdf", 77 | x="Num. of instances on a image", 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | IMG_ROOT = None # set here 83 | PKL_ANNO_PATH = None # set here 84 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first" 85 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first" 86 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH) 87 | 88 | cat_obj_count = d3.bbox_num_analyze() 89 | vis_num_instance(cat_obj_count) 90 | vis_num_category(cat_obj_count) 91 | vis_num_img_per_cat(cat_obj_count) 92 | vis_num_box_per_cat(cat_obj_count) 93 | vis_num_box_per_cat_per_img(cat_obj_count) 94 | 95 | d3.stat_description(with_rev=False) 96 | d3.stat_description(with_rev=True) 97 | d3.stat_description(with_rev=False, inter_group=True) 98 | d3.stat_description(with_rev=True, inter_group=True) 99 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='ddd-dataset', 5 | version='0.1.2', 6 | author='Chi Xie', 7 | author_email='chixie.personal@gmail.com', 8 | description='Toolkit for Description Detection Dataset ($D^3$)', 9 | long_description='Toolkit for Description Detection Dataset ($D^3$): A detection dataset with class names characterized by intricate and flexible expressions, for the Described Object Detection (DOD) task.', 10 | long_description_content_type='text/markdown', 11 | license='CC BY-NC 4.0', 12 | packages=['d_cube'], 13 | package_dir={"d_cube": "d_cube"}, 14 | url='https://github.com/shikras/d-cube', 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/shikras/d-cube/issues", 17 | }, 18 | install_requires=['numpy', 'pycocotools', 'opencv-python', 'matplotlib'], 19 | 20 | classifiers=[ 21 | 'Development Status :: 4 - Beta', 22 | 'Intended Audience :: Science/Research', 23 | 'Intended Audience :: Developers', 24 | 'Intended Audience :: Education', 25 | 'Operating System :: MacOS', 26 | 'Operating System :: Microsoft :: Windows', 27 | 'Operating System :: POSIX :: Linux', 28 | 'Programming Language :: Python :: 3', 29 | ], 30 | ) 31 | --------------------------------------------------------------------------------