├── .assets
│   ├── d-cube_logo.png
│   └── teaser.png
├── .gitignore
├── LICENSE
├── README.md
├── d_cube
│   ├── __init__.py
│   ├── d3.py
│   ├── data_util.py
│   └── vis_util.py
├── doc.md
├── eval_sota
│   ├── README.md
│   ├── groundingdino.py
│   ├── owl_vit.py
│   └── sphinx.py
├── qa.md
├── requirements.txt
├── scripts
│   ├── eval_and_analysis_json.py
│   ├── eval_json_example.py
│   └── get_d3_stat.py
└── setup.py
/.assets/d-cube_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/d-cube_logo.png
--------------------------------------------------------------------------------
/.assets/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shikras/d-cube/fa0ccd6358b2bb958e8dcf810fc758717f18e4ec/.assets/teaser.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .vscode/*
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 |
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 |
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 |
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 |
120 | # SageMath parsed files
121 | *.sage.py
122 |
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 |
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 |
136 | # Rope project settings
137 | .ropeproject
138 |
139 | # mkdocs documentation
140 | /site
141 |
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 |
147 | # Pyre type checker
148 | .pyre/
149 |
150 | # pytype static type analyzer
151 | .pytype/
152 |
153 | # Cython debug symbols
154 | cython_debug/
155 |
156 | # PyCharm
157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | # and can be added to the global gitignore or merged into this file. For a more nuclear
160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | #.idea/
162 |
163 | # mac system
164 | *.DS_Store
165 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Attribution-NonCommercial 4.0 International
2 |
3 | Creative Commons Corporation ("Creative Commons") is not a law firm and
4 | does not provide legal services or legal advice. Distribution of
5 | Creative Commons public licenses does not create a lawyer-client or
6 | other relationship. Creative Commons makes its licenses and related
7 | information available on an "as-is" basis. Creative Commons gives no
8 | warranties regarding its licenses, any material licensed under their
9 | terms and conditions, or any related information. Creative Commons
10 | disclaims all liability for damages resulting from their use to the
11 | fullest extent possible.
12 |
13 | Using Creative Commons Public Licenses
14 |
15 | Creative Commons public licenses provide a standard set of terms and
16 | conditions that creators and other rights holders may use to share
17 | original works of authorship and other material subject to copyright and
18 | certain other rights specified in the public license below. The
19 | following considerations are for informational purposes only, are not
20 | exhaustive, and do not form part of our licenses.
21 |
22 | - Considerations for licensors: Our public licenses are intended for
23 | use by those authorized to give the public permission to use
24 | material in ways otherwise restricted by copyright and certain other
25 | rights. Our licenses are irrevocable. Licensors should read and
26 | understand the terms and conditions of the license they choose
27 | before applying it. Licensors should also secure all rights
28 | necessary before applying our licenses so that the public can reuse
29 | the material as expected. Licensors should clearly mark any material
30 | not subject to the license. This includes other CC-licensed
31 | material, or material used under an exception or limitation to
32 | copyright. More considerations for licensors :
33 | wiki.creativecommons.org/Considerations_for_licensors
34 |
35 | - Considerations for the public: By using one of our public licenses,
36 | a licensor grants the public permission to use the licensed material
37 | under specified terms and conditions. If the licensor's permission
38 | is not necessary for any reason–for example, because of any
39 | applicable exception or limitation to copyright–then that use is not
40 | regulated by the license. Our licenses grant only permissions under
41 | copyright and certain other rights that a licensor has authority to
42 | grant. Use of the licensed material may still be restricted for
43 | other reasons, including because others have copyright or other
44 | rights in the material. A licensor may make special requests, such
45 | as asking that all changes be marked or described. Although not
46 | required by our licenses, you are encouraged to respect those
47 | requests where reasonable. More considerations for the public :
48 | wiki.creativecommons.org/Considerations_for_licensees
49 |
50 | Creative Commons Attribution-NonCommercial 4.0 International Public
51 | License
52 |
53 | By exercising the Licensed Rights (defined below), You accept and agree
54 | to be bound by the terms and conditions of this Creative Commons
55 | Attribution-NonCommercial 4.0 International Public License ("Public
56 | License"). To the extent this Public License may be interpreted as a
57 | contract, You are granted the Licensed Rights in consideration of Your
58 | acceptance of these terms and conditions, and the Licensor grants You
59 | such rights in consideration of benefits the Licensor receives from
60 | making the Licensed Material available under these terms and conditions.
61 |
62 | - Section 1 – Definitions.
63 |
64 | - a. Adapted Material means material subject to Copyright and
65 | Similar Rights that is derived from or based upon the Licensed
66 | Material and in which the Licensed Material is translated,
67 | altered, arranged, transformed, or otherwise modified in a
68 | manner requiring permission under the Copyright and Similar
69 | Rights held by the Licensor. For purposes of this Public
70 | License, where the Licensed Material is a musical work,
71 | performance, or sound recording, Adapted Material is always
72 | produced where the Licensed Material is synched in timed
73 | relation with a moving image.
74 | - b. Adapter's License means the license You apply to Your
75 | Copyright and Similar Rights in Your contributions to Adapted
76 | Material in accordance with the terms and conditions of this
77 | Public License.
78 | - c. Copyright and Similar Rights means copyright and/or similar
79 | rights closely related to copyright including, without
80 | limitation, performance, broadcast, sound recording, and Sui
81 | Generis Database Rights, without regard to how the rights are
82 | labeled or categorized. For purposes of this Public License, the
83 | rights specified in Section 2(b)(1)-(2) are not Copyright and
84 | Similar Rights.
85 | - d. Effective Technological Measures means those measures that,
86 | in the absence of proper authority, may not be circumvented
87 | under laws fulfilling obligations under Article 11 of the WIPO
88 | Copyright Treaty adopted on December 20, 1996, and/or similar
89 | international agreements.
90 | - e. Exceptions and Limitations means fair use, fair dealing,
91 | and/or any other exception or limitation to Copyright and
92 | Similar Rights that applies to Your use of the Licensed
93 | Material.
94 | - f. Licensed Material means the artistic or literary work,
95 | database, or other material to which the Licensor applied this
96 | Public License.
97 | - g. Licensed Rights means the rights granted to You subject to
98 | the terms and conditions of this Public License, which are
99 | limited to all Copyright and Similar Rights that apply to Your
100 | use of the Licensed Material and that the Licensor has authority
101 | to license.
102 | - h. Licensor means the individual(s) or entity(ies) granting
103 | rights under this Public License.
104 | - i. NonCommercial means not primarily intended for or directed
105 | towards commercial advantage or monetary compensation. For
106 | purposes of this Public License, the exchange of the Licensed
107 | Material for other material subject to Copyright and Similar
108 | Rights by digital file-sharing or similar means is NonCommercial
109 | provided there is no payment of monetary compensation in
110 | connection with the exchange.
111 | - j. Share means to provide material to the public by any means or
112 | process that requires permission under the Licensed Rights, such
113 | as reproduction, public display, public performance,
114 | distribution, dissemination, communication, or importation, and
115 | to make material available to the public including in ways that
116 | members of the public may access the material from a place and
117 | at a time individually chosen by them.
118 | - k. Sui Generis Database Rights means rights other than copyright
119 | resulting from Directive 96/9/EC of the European Parliament and
120 | of the Council of 11 March 1996 on the legal protection of
121 | databases, as amended and/or succeeded, as well as other
122 | essentially equivalent rights anywhere in the world.
123 | - l. You means the individual or entity exercising the Licensed
124 | Rights under this Public License. Your has a corresponding
125 | meaning.
126 |
127 | - Section 2 – Scope.
128 |
129 | - a. License grant.
130 | - 1. Subject to the terms and conditions of this Public
131 | License, the Licensor hereby grants You a worldwide,
132 | royalty-free, non-sublicensable, non-exclusive, irrevocable
133 | license to exercise the Licensed Rights in the Licensed
134 | Material to:
135 | - A. reproduce and Share the Licensed Material, in whole
136 | or in part, for NonCommercial purposes only; and
137 | - B. produce, reproduce, and Share Adapted Material for
138 | NonCommercial purposes only.
139 | - 2. Exceptions and Limitations. For the avoidance of doubt,
140 | where Exceptions and Limitations apply to Your use, this
141 | Public License does not apply, and You do not need to comply
142 | with its terms and conditions.
143 | - 3. Term. The term of this Public License is specified in
144 | Section 6(a).
145 | - 4. Media and formats; technical modifications allowed. The
146 | Licensor authorizes You to exercise the Licensed Rights in
147 | all media and formats whether now known or hereafter
148 | created, and to make technical modifications necessary to do
149 | so. The Licensor waives and/or agrees not to assert any
150 | right or authority to forbid You from making technical
151 | modifications necessary to exercise the Licensed Rights,
152 | including technical modifications necessary to circumvent
153 | Effective Technological Measures. For purposes of this
154 | Public License, simply making modifications authorized by
155 | this Section 2(a)(4) never produces Adapted Material.
156 | - 5. Downstream recipients.
157 | - A. Offer from the Licensor – Licensed Material. Every
158 | recipient of the Licensed Material automatically
159 | receives an offer from the Licensor to exercise the
160 | Licensed Rights under the terms and conditions of this
161 | Public License.
162 | - B. No downstream restrictions. You may not offer or
163 | impose any additional or different terms or conditions
164 | on, or apply any Effective Technological Measures to,
165 | the Licensed Material if doing so restricts exercise of
166 | the Licensed Rights by any recipient of the Licensed
167 | Material.
168 | - 6. No endorsement. Nothing in this Public License
169 | constitutes or may be construed as permission to assert or
170 | imply that You are, or that Your use of the Licensed
171 | Material is, connected with, or sponsored, endorsed, or
172 | granted official status by, the Licensor or others
173 | designated to receive attribution as provided in Section
174 | 3(a)(1)(A)(i).
175 | - b. Other rights.
176 | - 1. Moral rights, such as the right of integrity, are not
177 | licensed under this Public License, nor are publicity,
178 | privacy, and/or other similar personality rights; however,
179 | to the extent possible, the Licensor waives and/or agrees
180 | not to assert any such rights held by the Licensor to the
181 | limited extent necessary to allow You to exercise the
182 | Licensed Rights, but not otherwise.
183 | - 2. Patent and trademark rights are not licensed under this
184 | Public License.
185 | - 3. To the extent possible, the Licensor waives any right to
186 | collect royalties from You for the exercise of the Licensed
187 | Rights, whether directly or through a collecting society
188 | under any voluntary or waivable statutory or compulsory
189 | licensing scheme. In all other cases the Licensor expressly
190 | reserves any right to collect such royalties, including when
191 | the Licensed Material is used other than for NonCommercial
192 | purposes.
193 |
194 | - Section 3 – License Conditions.
195 |
196 | Your exercise of the Licensed Rights is expressly made subject to
197 | the following conditions.
198 |
199 | - a. Attribution.
200 | - 1. If You Share the Licensed Material (including in modified
201 | form), You must:
202 | - A. retain the following if it is supplied by the
203 | Licensor with the Licensed Material:
204 | - i. identification of the creator(s) of the Licensed
205 | Material and any others designated to receive
206 | attribution, in any reasonable manner requested by
207 | the Licensor (including by pseudonym if designated);
208 | - ii. a copyright notice;
209 | - iii. a notice that refers to this Public License;
210 | - iv. a notice that refers to the disclaimer of
211 | warranties;
212 | - v. a URI or hyperlink to the Licensed Material to
213 | the extent reasonably practicable;
214 | - B. indicate if You modified the Licensed Material and
215 | retain an indication of any previous modifications; and
216 | - C. indicate the Licensed Material is licensed under this
217 | Public License, and include the text of, or the URI or
218 | hyperlink to, this Public License.
219 | - 2. You may satisfy the conditions in Section 3(a)(1) in any
220 | reasonable manner based on the medium, means, and context in
221 | which You Share the Licensed Material. For example, it may
222 | be reasonable to satisfy the conditions by providing a URI
223 | or hyperlink to a resource that includes the required
224 | information.
225 | - 3. If requested by the Licensor, You must remove any of the
226 | information required by Section 3(a)(1)(A) to the extent
227 | reasonably practicable.
228 | - 4. If You Share Adapted Material You produce, the Adapter's
229 | License You apply must not prevent recipients of the Adapted
230 | Material from complying with this Public License.
231 |
232 | - Section 4 – Sui Generis Database Rights.
233 |
234 | Where the Licensed Rights include Sui Generis Database Rights that
235 | apply to Your use of the Licensed Material:
236 |
237 | - a. for the avoidance of doubt, Section 2(a)(1) grants You the
238 | right to extract, reuse, reproduce, and Share all or a
239 | substantial portion of the contents of the database for
240 | NonCommercial purposes only;
241 | - b. if You include all or a substantial portion of the database
242 | contents in a database in which You have Sui Generis Database
243 | Rights, then the database in which You have Sui Generis Database
244 | Rights (but not its individual contents) is Adapted Material;
245 | and
246 | - c. You must comply with the conditions in Section 3(a) if You
247 | Share all or a substantial portion of the contents of the
248 | database.
249 |
250 | For the avoidance of doubt, this Section 4 supplements and does not
251 | replace Your obligations under this Public License where the
252 | Licensed Rights include other Copyright and Similar Rights.
253 |
254 | - Section 5 – Disclaimer of Warranties and Limitation of Liability.
255 |
256 | - a. Unless otherwise separately undertaken by the Licensor, to
257 | the extent possible, the Licensor offers the Licensed Material
258 | as-is and as-available, and makes no representations or
259 | warranties of any kind concerning the Licensed Material, whether
260 | express, implied, statutory, or other. This includes, without
261 | limitation, warranties of title, merchantability, fitness for a
262 | particular purpose, non-infringement, absence of latent or other
263 | defects, accuracy, or the presence or absence of errors, whether
264 | or not known or discoverable. Where disclaimers of warranties
265 | are not allowed in full or in part, this disclaimer may not
266 | apply to You.
267 | - b. To the extent possible, in no event will the Licensor be
268 | liable to You on any legal theory (including, without
269 | limitation, negligence) or otherwise for any direct, special,
270 | indirect, incidental, consequential, punitive, exemplary, or
271 | other losses, costs, expenses, or damages arising out of this
272 | Public License or use of the Licensed Material, even if the
273 | Licensor has been advised of the possibility of such losses,
274 | costs, expenses, or damages. Where a limitation of liability is
275 | not allowed in full or in part, this limitation may not apply to
276 | You.
277 | - c. The disclaimer of warranties and limitation of liability
278 | provided above shall be interpreted in a manner that, to the
279 | extent possible, most closely approximates an absolute
280 | disclaimer and waiver of all liability.
281 |
282 | - Section 6 – Term and Termination.
283 |
284 | - a. This Public License applies for the term of the Copyright and
285 | Similar Rights licensed here. However, if You fail to comply
286 | with this Public License, then Your rights under this Public
287 | License terminate automatically.
288 | - b. Where Your right to use the Licensed Material has terminated
289 | under Section 6(a), it reinstates:
290 |
291 | - 1. automatically as of the date the violation is cured,
292 | provided it is cured within 30 days of Your discovery of the
293 | violation; or
294 | - 2. upon express reinstatement by the Licensor.
295 |
296 | For the avoidance of doubt, this Section 6(b) does not affect
297 | any right the Licensor may have to seek remedies for Your
298 | violations of this Public License.
299 |
300 | - c. For the avoidance of doubt, the Licensor may also offer the
301 | Licensed Material under separate terms or conditions or stop
302 | distributing the Licensed Material at any time; however, doing
303 | so will not terminate this Public License.
304 | - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
305 | License.
306 |
307 | - Section 7 – Other Terms and Conditions.
308 |
309 | - a. The Licensor shall not be bound by any additional or
310 | different terms or conditions communicated by You unless
311 | expressly agreed.
312 | - b. Any arrangements, understandings, or agreements regarding the
313 | Licensed Material not stated herein are separate from and
314 | independent of the terms and conditions of this Public License.
315 |
316 | - Section 8 – Interpretation.
317 |
318 | - a. For the avoidance of doubt, this Public License does not, and
319 | shall not be interpreted to, reduce, limit, restrict, or impose
320 | conditions on any use of the Licensed Material that could
321 | lawfully be made without permission under this Public License.
322 | - b. To the extent possible, if any provision of this Public
323 | License is deemed unenforceable, it shall be automatically
324 | reformed to the minimum extent necessary to make it enforceable.
325 | If the provision cannot be reformed, it shall be severed from
326 | this Public License without affecting the enforceability of the
327 | remaining terms and conditions.
328 | - c. No term or condition of this Public License will be waived
329 | and no failure to comply consented to unless expressly agreed to
330 | by the Licensor.
331 | - d. Nothing in this Public License constitutes or may be
332 | interpreted as a limitation upon, or waiver of, any privileges
333 | and immunities that apply to the Licensor or You, including from
334 | the legal processes of any jurisdiction or authority.
335 |
336 | Creative Commons is not a party to its public licenses. Notwithstanding,
337 | Creative Commons may elect to apply one of its public licenses to
338 | material it publishes and in those instances will be considered the
339 | "Licensor." The text of the Creative Commons public licenses is
340 | dedicated to the public domain under the CC0 Public Domain Dedication.
341 | Except for the limited purpose of indicating that material is shared
342 | under a Creative Commons public license or as otherwise permitted by the
343 | Creative Commons policies published at creativecommons.org/policies,
344 | Creative Commons does not authorize the use of the trademark "Creative
345 | Commons" or any other trademark or logo of Creative Commons without its
346 | prior written consent including, without limitation, in connection with
347 | any unauthorized modifications to any of its public licenses or any
348 | other arrangements, understandings, or agreements concerning use of
349 | licensed material. For the avoidance of doubt, this paragraph does not
350 | form part of the public licenses.
351 |
352 | Creative Commons may be contacted at creativecommons.org.
353 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![D³ logo](.assets/d-cube_logo.png)
2 |
3 |
4 |
5 |
6 | A detection/segmentation dataset with class names characterized by intricate and flexible expressions
7 |
8 | This repo is the toolbox for $D^3$.
9 |
10 | [Doc 📚](doc.md)
11 |
12 | [Paper (DOD) 📄](https://arxiv.org/abs/2307.12813)
13 | [Paper (GRES) 📄](https://arxiv.org/abs/2305.12452)
14 | [Awesome-DOD 🕶️](https://github.com/Charles-Xie/awesome-described-object-detection)
15 |
16 |
17 |
18 |
19 | ***
20 | Description Detection Dataset ($D^3$, /dikju:b/) is an attempt at creating a next-generation object detection dataset.
21 | Unlike traditional detection datasets, the class names of the objects are no longer just simple nouns or noun phrases, but rather complex, flexible descriptions, such as `a dog not being held by a leash`.
22 | For each image in the dataset, any object that matches the description is annotated.
23 | The dataset provides annotations such as bounding boxes and finely crafted instance masks.
24 | We believe it will contribute to the computer vision and vision-language communities.
25 |
26 |
27 |
28 | # News
29 | - [02/14/2024] Evaluation results for several SOTA methods (SPHINX (the first MLLM evaluated!), G-DINO, UNINEXT, etc.) are released, together with a [leaderboard](https://github.com/shikras/d-cube/tree/main/eval_sota) for $D^3$. :fire::fire:
30 |
31 | - [10/12/2023] We released an [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection) list to collect and track related works.
32 |
33 | - [09/22/2023] Our DOD [paper](https://arxiv.org/abs/2307.12813) just got accepted by NeurIPS 2023! :fire:
34 |
35 | - [07/25/2023] This toolkit is available on PyPI now. You can install this repo with `pip install ddd-dataset`.
36 |
37 | - [07/25/2023] The [paper preprint](https://arxiv.org/abs/2307.12813) introducing the DOD task and the $D^3$ dataset is available on arXiv. Check it out!
38 |
39 | - [07/18/2023] We have released our Description Detection Dataset ($D^3$) and the first version of the $D^3$ toolbox. You can download it now for your project.
40 |
41 | - [07/14/2023] Our GRES [paper](https://arxiv.org/abs/2305.12452) has been accepted by ICCV 2023.
42 |
43 |
44 |
45 | # Contents
46 | - [Dataset Highlight](#task-and-dataset-highlight)
47 | - [Download](#download)
48 | - [Installation](#installation)
49 | - [Usage](#usage)
50 |
51 |
52 |
53 | # Task and Dataset Highlight
54 |
55 | The $D^3$ dataset is meant for the Described Object Detection (DOD) task. The image below shows the difference between Referring Expression Comprehension (REC), Object Detection/Open-Vocabulary Detection (OVD) and Described Object Detection (DOD). OVD detects objects based on category names, and each category can have zero to multiple instances; REC grounds exactly one region based on a language description, whether the described object truly exists or not; DOD detects all instances in each image of the dataset, based on a flexible reference. Related works are tracked in the [awesome-DOD](https://github.com/Charles-Xie/awesome-described-object-detection) list.
56 |
57 | ![Comparison of REC, OVD and DOD](.assets/teaser.png)
58 |
59 | For more information on the characteristics of this dataset, please refer to our paper.
60 |
61 |
62 |
63 | # Download
64 | Currently we host the $D^3$ dataset on cloud drives. You can download the dataset from [Google Drive](https://drive.google.com/drive/folders/11kfY12NzKPwsliLEcIYki1yUqt7PbMEi?usp=sharing) or [Baidu Pan]().
65 |
66 | After downloading `d3_images.zip` (images in the dataset), `d3_pkl.zip` (dataset information for this toolkit) and `d3_json.zip` (annotations for evaluation), please extract these 3 zip files to your custom `IMG_ROOT`, `PKL_ANNO_PATH` and `JSON_ANNO_PATH` directories. These paths will be used when you perform inference or evaluation on this dataset.
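
For reference, after extraction you might point the toolkit and evaluation scripts at the data through three paths like the following sketch (the locations are placeholders for illustration only; put the data wherever you like):

```python
# Placeholder paths -- set these to wherever you extracted the three zip files.
IMG_ROOT = "/path/to/d3_images"      # contents of d3_images.zip
PKL_ANNO_PATH = "/path/to/d3_pkl"    # contents of d3_pkl.zip
JSON_ANNO_PATH = "/path/to/d3_json"  # contents of d3_json.zip
```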
67 |
68 |
69 |
70 | # Installation
71 |
72 | ## Prerequisites
73 | This toolkit requires a few Python packages such as `numpy` and `pycocotools`. Other packages such as `matplotlib` and `opencv-python` may also be required if you want to use the visualization scripts.
74 |
75 |
76 |
77 | There are multiple ways to install the $D^3$ toolbox, as listed below:
78 |
79 |
80 | ## Install with pip
81 | ```bash
82 | pip install ddd-dataset
83 | ```
84 |
85 | ## Install from source
86 | ```bash
87 | git clone https://github.com/shikras/d-cube.git
88 | # option 1: install it as a python package
89 | cd d-cube
90 | python -m pip install .
91 | # done
92 |
93 | # option 2: just put the d-cube/d_cube directory in the root directory of your local repository
94 | ```
95 |
96 |
100 |
101 |
102 |
103 | # Usage
104 | Please refer to the [documentation 📚](doc.md) for more details.
105 | Our toolbox is similar to [cocoapi](https://github.com/cocodataset/cocoapi) in style.
106 |
107 | Here is a quick example of how to use $D^3$.
108 | ```python
109 | from d_cube import D3
110 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
111 | all_img_ids = d3.get_img_ids() # get the image ids in the dataset
112 | all_img_info = d3.load_imgs(all_img_ids) # load image info by passing a list of image ids
113 | img_path = all_img_info[0]["file_name"] # obtain one image path so you can load it and run inference
114 | ```
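
Beyond image info, the toolkit also exposes the descriptions (sentences), groups and instance annotations. Below is a minimal sketch of browsing them with the same API (methods as defined in `d_cube/d3.py`; `IMG_ROOT` and `PKL_ANNO_PATH` are the placeholder paths from the Download section):

```python
from d_cube import D3

d3 = D3(IMG_ROOT, PKL_ANNO_PATH)

# all category-like descriptions in the dataset
sent_ids = d3.get_sent_ids()
sent_infos = d3.load_sents(sent_ids)
descriptions = [s["raw_sent"] for s in sent_infos]

# for one image: its annotated instances, their boxes, and the descriptions they match
img_id = d3.get_img_ids()[0]
anno_infos = d3.load_annos(d3.get_anno_ids(img_ids=img_id))
for anno in anno_infos:
    boxes = anno["bbox"]          # bounding box(es) of this instance annotation
    for sid in anno["sent_id"]:   # ids of the descriptions this instance matches
        print(d3.load_sents(sent_ids=sid)[0]["raw_sent"], boxes)
```

For evaluation, the JSON annotations extracted to `JSON_ANNO_PATH` are used together with the scripts under `scripts/` and `eval_sota/`; see the [documentation](doc.md) for details.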
115 |
116 | Some frequently asked questions are answered in [this Q&A file](./qa.md).
117 |
118 | # Citation
119 |
120 | If you use our $D^3$ dataset, this toolbox, or otherwise find our work valuable, please cite [our paper](https://arxiv.org/abs/2307.12813):
121 |
122 | ```bibtex
123 | @inproceedings{xie2023DOD,
124 | title={Described Object Detection: Liberating Object Detection with Flexible Expressions},
125 | author={Xie, Chi and Zhang, Zhao and Wu, Yixuan and Zhu, Feng and Zhao, Rui and Liang, Shuang},
126 | booktitle={Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS)},
127 | year={2023}
128 | }
129 |
130 | @inproceedings{wu2023gres,
131 | title={Advancing Referring Expression Segmentation Beyond Single Image},
132 | author={Wu, Yixuan and Zhang, Zhao and Xie, Chi and Zhu, Feng and Zhao, Rui},
133 | booktitle={International Conference on Computer Vision (ICCV)},
134 | year={2023}
135 | }
136 | ```
137 |
138 | More works related to Described Object Detection are tracked in this list: [awesome-described-object-detection](https://github.com/Charles-Xie/awesome-described-object-detection).
139 |
--------------------------------------------------------------------------------
/d_cube/__init__.py:
--------------------------------------------------------------------------------
1 | from .d3 import D3
2 |
--------------------------------------------------------------------------------
/d_cube/d3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Zhao Zhang"
3 | __maintainer__ = "Chi Xie"
4 | # this is the core of the d-cube toolkit
5 | import os
6 | import os.path as osp
7 | import json
8 | from collections import defaultdict
9 |
10 | import numpy as np
11 | from pycocotools import mask
12 | import cv2
13 | import matplotlib.pyplot as plt
14 |
15 |
16 | from .data_util import *
17 |
18 |
19 | class D3:
20 | def __init__(self, img_root, anno_root):
21 | self.image_dir = img_root
22 | self.anno_dir = anno_root
23 | self.load_data()
24 |
25 | def load_data(self):
26 | file_names = ["sentences.pkl", "annotations.pkl", "images.pkl", "groups.pkl"]
27 | self.data = {
28 | name.split(".")[0]: load_pkl(osp.join(self.anno_dir, name))
29 | for name in file_names
30 | }
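        # Fields referenced elsewhere in this toolkit (not an exhaustive schema; inferred from usage):
        #   sentences[sent_id]:   "id", "raw_sent", "is_negative", "anno_id", "group_id"
        #   annotations[anno_id]: "image_id", "group_id", "sent_id", "segmentation", "bbox", "area"
        #   images[img_id]:       "id", "file_name", "height", "width", "anno_id", "group_id"
        #   groups[group_id]:     "group_name", "scene", "img_id", "inner_sent_id"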
31 |
32 | def get_sent_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]):
33 | """get sentence ids for D-cube.
34 |
35 | Args:
36 | anno_ids (list, optional): annotation ids to get sentence ids. Defaults to [].
37 | img_ids (list, optional): image ids to get sentence ids. Defaults to [].
38 | group_ids (list, optional): group ids to get sentence ids. Defaults to [].
39 |             sent_ids (list, optional): sentence ids used to filter (intersect) the result. Defaults to [].
40 |
41 | Raises:
42 | Exception: anno_ids, img_ids and group_ids cannot be used together.
43 |
44 | Returns:
45 | list: sentence ids.
46 | """
47 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids]
48 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids]
49 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids]
50 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids]
51 |
52 | if not any([img_ids, anno_ids, group_ids, sent_ids]):
53 | return list(self.data["sentences"].keys())
54 |
55 | if (
56 | (anno_ids and img_ids)
57 | or (anno_ids and group_ids)
58 | or (img_ids and group_ids)
59 | ):
60 | raise Exception("anno_ids, img_ids, group_ids can only be used alone")
61 |
62 | out_ids_set = set()
63 | if img_ids:
64 | for img_id in img_ids:
65 | imganno_ids = self.data["images"][img_id]["anno_id"]
66 | for ianno_id in imganno_ids:
67 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"])
68 |
69 | if group_ids:
70 | for group_id in group_ids:
71 | out_ids_set |= set(self.data["groups"][group_id]["inner_sent_id"])
72 |
73 | if anno_ids:
74 | for ianno_id in anno_ids:
75 | out_ids_set |= set(self.data["annotations"][ianno_id]["sent_id"])
76 |
77 | if sent_ids:
78 | out_ids_set &= set(sent_ids)
79 |
80 | return list(out_ids_set)
81 |
82 | def get_anno_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]):
83 | """get annotation ids for D-cube.
84 |
85 | Args:
86 |             anno_ids (list, optional): annotation ids used to filter (intersect) the result. Defaults to [].
87 | img_ids (list, optional): image ids to get annotation ids. Defaults to [].
88 | group_ids (list, optional): group ids to get annotation ids. Defaults to [].
89 | sent_ids (list, optional): sentence ids to get annotation ids. Defaults to [].
90 |
91 | Raises:
92 | Exception: img_ids and group_ids cannot be used together.
93 |
94 | Returns:
95 | list: annotation ids.
96 | """
97 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids]
98 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids]
99 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids]
100 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids]
101 |
102 | if not any([img_ids, anno_ids, group_ids, sent_ids]):
103 | return list(self.data["annotations"].keys())
104 |
105 | if img_ids and group_ids:
106 | raise Exception("img_ids, group_ids can only be used alone")
107 |
108 | out_ids_set = set()
109 | if img_ids:
110 | for img_id in img_ids:
111 | out_ids_set |= set(self.data["images"][img_id]["anno_id"])
112 |
113 | if group_ids:
114 | for group_id in group_ids:
115 | for groupimg_id in self.data["groups"][group_id]["img_id"]:
116 | out_ids_set |= set(self.data["images"][groupimg_id]["anno_id"])
117 |
118 | if sent_ids and img_ids:
119 | for sent_id in sent_ids:
120 | out_ids_set &= set(self.data["sentences"][sent_id]["anno_id"])
121 | else:
122 | for sent_id in sent_ids:
123 | out_ids_set |= set(self.data["sentences"][sent_id]["anno_id"])
124 |
125 | if anno_ids:
126 | out_ids_set &= set(anno_ids)
127 |
128 | return list(out_ids_set)
129 |
130 | def get_img_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]):
131 | """get image ids for D-cube.
132 |
133 | Args:
134 | anno_ids (list, optional): annotation ids to get image ids. Defaults to [].
135 |             img_ids (list, optional): image ids used to filter (intersect) the result. Defaults to [].
136 | group_ids (list, optional): group ids to get image ids. Defaults to [].
137 | sent_ids (list, optional): sentence ids to get image ids. Defaults to [].
138 |
139 | Raises:
140 | Exception: anno_ids and img_ids cannot be used together.
141 | Exception: anno_ids and group_ids cannot be used together.
142 |
143 | Returns:
144 | list: image ids.
145 | """
146 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids]
147 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids]
148 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids]
149 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids]
150 |
151 | if not any([img_ids, anno_ids, group_ids, sent_ids]):
152 | return list(self.data["images"].keys())
153 |
154 | if anno_ids and img_ids:
155 | raise Exception("anno_ids and img_ids can only be used alone")
156 | if anno_ids and group_ids:
157 | raise Exception("anno_ids and group_ids can only be used alone")
158 |
159 | out_ids_set = set()
160 | if anno_ids:
161 | for ianno_id in anno_ids:
162 | out_ids_set.add(self.data["annotations"][ianno_id]["image_id"])
163 |
164 | if group_ids:
165 | for group_id in group_ids:
166 | out_ids_set |= set(self.data["groups"][group_id]["img_id"])
167 |
168 | if sent_ids:
169 | for sent_id in sent_ids:
170 | for sentanno_id in self.data["sentences"][sent_id]["anno_id"]:
171 | out_ids_set.add(self.data["annotations"][sentanno_id]["image_id"])
172 |
173 | if img_ids:
174 | out_ids_set &= set(img_ids)
175 |
176 | return list(out_ids_set)
177 |
178 | def get_group_ids(self, anno_ids=[], img_ids=[], group_ids=[], sent_ids=[]):
179 | """get group ids for D-cube.
180 |
181 | Args:
182 | anno_ids (list, optional): annotation ids to get group ids. Defaults to [].
183 | img_ids (list, optional): image ids to get group ids. Defaults to [].
184 |             group_ids (list, optional): group ids used to filter (intersect) the result. Defaults to [].
185 | sent_ids (list, optional): sentence ids to get group ids. Defaults to [].
186 |
187 | Raises:
188 | Exception: anno_ids, img_ids and sent_ids cannot be used together.
189 |
190 | Returns:
191 | list: group ids.
192 | """
193 | img_ids = img_ids if isinstance(img_ids, list) else [img_ids]
194 | anno_ids = anno_ids if isinstance(anno_ids, list) else [anno_ids]
195 | group_ids = group_ids if isinstance(group_ids, list) else [group_ids]
196 | sent_ids = sent_ids if isinstance(sent_ids, list) else [sent_ids]
197 |
198 | if not any([img_ids, anno_ids, group_ids, sent_ids]):
199 | return list(self.data["groups"].keys())
200 |
201 | if anno_ids and img_ids:
202 | raise Exception("anno_ids and img_ids can only be used alone")
203 | if anno_ids and sent_ids:
204 | raise Exception("anno_ids and sent_ids can only be used alone")
205 | if img_ids and sent_ids:
206 | raise Exception("img_ids and sent_ids can only be used alone")
207 |
208 | out_ids_set = set()
209 | if img_ids:
210 | for img_id in img_ids:
211 | out_ids_set.add(self.data["images"][img_id]["group_id"])
212 |
213 | if anno_ids:
214 | for anno_id in anno_ids:
215 | out_ids_set.add(self.data["annotations"][anno_id]["group_id"])
216 |
217 | if sent_ids:
218 | for sent_id in sent_ids:
219 | out_ids_set |= set(self.data["sentences"][sent_id]["group_id"])
220 |
221 | if group_ids:
222 | out_ids_set &= set(group_ids)
223 |
224 | return list(out_ids_set)
225 |
226 | def load_sents(self, sent_ids=None):
227 | """load sentence info.
228 |
229 | Args:
230 | sent_ids (list, int, optional): sentence ids. Defaults to None.
231 |
232 | Returns:
233 | list: a list of sentence info.
234 | """
235 | if sent_ids is not None and not isinstance(sent_ids, list):
236 | sent_ids = [sent_ids]
237 | if isinstance(sent_ids, list):
238 | return [self.data["sentences"][sent_id] for sent_id in sent_ids]
239 | else:
240 | return list(self.data["sentences"].values())
241 |
242 | def load_annos(self, anno_ids=None):
243 | """load annotation info.
244 |
245 | Args:
246 | anno_ids (list, int, optional): annotation ids. Defaults to None.
247 |
248 | Returns:
249 | list: a list of annotation info.
250 | """
251 | if anno_ids is not None and not isinstance(anno_ids, list):
252 | anno_ids = [anno_ids]
253 | if isinstance(anno_ids, list):
254 | return [self.data["annotations"][anno_id] for anno_id in anno_ids]
255 | else:
256 | return list(self.data["annotations"].values())
257 |
258 | def load_imgs(self, img_ids=None):
259 | """load image info.
260 |
261 | Args:
262 | img_ids (list, int, optional): image ids. Defaults to None.
263 |
264 | Returns:
265 | list: a list of image info.
266 | """
267 | if img_ids is not None and not isinstance(img_ids, list):
268 | img_ids = [img_ids]
269 | if isinstance(img_ids, list):
270 |             return [self.data["images"][img_id] for img_id in img_ids]
271 | else:
272 | return list(self.data["images"].values())
273 |
274 | def load_groups(self, group_ids=None):
275 | """load group info.
276 |
277 | Args:
278 | group_ids (list, int, optional): group ids. Defaults to None.
279 |
280 | Returns:
281 | list: a list of group info.
282 | """
283 | if group_ids is not None and not isinstance(group_ids, list):
284 | group_ids = [group_ids]
285 | if isinstance(group_ids, list):
286 |             return [self.data["groups"][group_id] for group_id in group_ids]
287 | else:
288 | return list(self.data["groups"].values())
289 |
290 | def get_mask(self, anno):
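        """Decode the RLE segmentation(s) of the first annotation in `anno` into one merged binary mask and its total area."""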
291 | rle = anno[0]["segmentation"]
292 | m = mask.decode(rle)
293 | m = np.sum(
294 | m, axis=2
295 | ) # sometimes there are multiple binary map (corresponding to multiple segs)
296 | m = m.astype(np.uint8) # convert to np.uint8
297 | # compute area
298 | area = sum(mask.area(rle)) # should be close to ann['area']
299 | return {"mask": m, "area": area}
300 |
301 | def show_mask(self, anno):
302 | M = self.get_mask(anno)
303 | msk = M["mask"]
304 | ax = plt.gca()
305 | ax.imshow(msk)
306 |
307 | def show_image_seg(
308 | self,
309 | img_ids=[],
310 | save_dir=None,
311 | show_sent=False,
312 | on_image=False,
313 | checkerboard_bg=False,
314 | is_instance=True,
315 | ):
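        """Save per-description segmentation visualizations for the given images under `save_dir` (the raw image plus one file per image-description pair)."""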
316 | if is_instance and checkerboard_bg:
317 | raise ValueError(
318 |                 "Cannot apply both is_instance and checkerboard_bg at the same time."
319 | )
320 | img_infos = self.load_imgs(img_ids=img_ids)
321 | for img_idx, img_info in enumerate(img_infos):
322 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"]))
323 | anno_infos = self.load_annos(img_info["anno_id"])
324 |
325 | bm_canvas = defaultdict(list)
326 | merge_canvas = defaultdict(list)
327 | for anno_info in anno_infos:
328 | for sent_id in anno_info["sent_id"]:
329 | bm_canvas[sent_id].append(anno_info["segmentation"])
330 |
331 | for sent_id, bm_list in bm_canvas.items():
332 | merge_canvas[sent_id] = merge_rle(
333 | bm_list, is_instance=is_instance, on_image=on_image
334 | )
335 |
336 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img)
337 | for sent_id, merge_mask in merge_canvas.items():
338 | if checkerboard_bg:
339 | merge_mask = add_checkerboard_bg(img, merge_mask)
340 | elif on_image:
341 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True)
342 | if show_sent:
343 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"]
344 | merge_mask = paste_text(merge_mask, sent_en)
345 | cv2.imwrite(
346 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask
347 | )
348 |
349 | return merge_canvas
350 |
351 | def show_group_seg(
352 | self,
353 | group_ids,
354 | save_root,
355 | show_sent=True,
356 | is_instance=True,
357 | on_image=False,
358 | checkerboard_bg=False,
359 | ):
360 | group_infos = self.load_groups(group_ids=group_ids)
361 | for group_info in group_infos:
362 | save_dir = osp.join(save_root, group_info["group_name"])
363 | os.makedirs(save_dir, exist_ok=True)
364 | self.show_image_seg(
365 | img_ids=group_info["img_id"],
366 | save_dir=save_dir,
367 | show_sent=show_sent,
368 | is_instance=is_instance,
369 | on_image=on_image,
370 | checkerboard_bg=checkerboard_bg,
371 | )
372 |
373 | def show_image_seg_bbox(
374 | self,
375 | img_ids=[],
376 | save_dir=None,
377 | show_sent=False,
378 | on_image=False,
379 | checkerboard_bg=False,
380 | is_instance=True,
381 | ):
382 | if is_instance and checkerboard_bg:
383 | raise ValueError(
384 |                 "Cannot apply both is_instance and checkerboard_bg at the same time."
385 | )
386 | img_infos = self.load_imgs(img_ids=img_ids)
387 | for img_idx, img_info in enumerate(img_infos):
388 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"]))
389 | anno_infos = self.load_annos(img_info["anno_id"])
390 |
391 | bm_canvas = defaultdict(list)
392 | merge_canvas = defaultdict(list)
393 | sent_boxes = defaultdict(list)
394 | for anno_info in anno_infos:
395 | for sent_id in anno_info["sent_id"]:
396 | bm_canvas[sent_id].append(anno_info["segmentation"])
397 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist())
398 |
399 | for sent_id, bm_list in bm_canvas.items():
400 | merge_canvas[sent_id] = merge_rle(
401 | bm_list, is_instance=is_instance, on_image=on_image
402 | )
403 |
404 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img)
405 | for sent_id, merge_mask in merge_canvas.items():
406 | # vis mask
407 | if checkerboard_bg:
408 | merge_mask = add_checkerboard_bg(img, merge_mask)
409 | elif on_image:
410 | merge_mask = visualize_mask_on_image(img, merge_mask, add_edge=True)
411 | # vis box
412 | bboxes = sent_boxes[sent_id]
413 | merge_mask = visualize_bbox_on_image(merge_mask, bboxes)
414 | # vis sent
415 | if show_sent:
416 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"]
417 | merge_mask = paste_text(merge_mask, sent_en)
418 | cv2.imwrite(
419 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_mask
420 | )
421 |
422 | return merge_canvas
423 |
424 | def show_group_seg_bbox(
425 | self,
426 | group_ids,
427 | save_root,
428 | show_sent=True,
429 | is_instance=True,
430 | on_image=False,
431 | checkerboard_bg=False,
432 | ):
433 | group_infos = self.load_groups(group_ids=group_ids)
434 | for group_info in group_infos:
435 | save_dir = osp.join(save_root, group_info["group_name"])
436 | os.makedirs(save_dir, exist_ok=True)
437 | self.show_image_seg_bbox(
438 | img_ids=group_info["img_id"],
439 | save_dir=save_dir,
440 | show_sent=show_sent,
441 | is_instance=is_instance,
442 | on_image=on_image,
443 | checkerboard_bg=checkerboard_bg,
444 | )
445 |
446 | def show_image_bbox(self, img_ids=[], save_dir=None, show_sent=False):
447 | img_infos = self.load_imgs(img_ids=img_ids)
448 | for img_idx, img_info in enumerate(img_infos):
449 | img = cv2.imread(osp.join(self.image_dir, img_info["file_name"]))
450 | anno_infos = self.load_annos(img_info["anno_id"])
451 |
452 | sent_boxes = defaultdict(list)
453 | for anno_info in anno_infos:
454 | for sent_id in anno_info["sent_id"]:
455 | sent_boxes[sent_id].append(anno_info["bbox"][0].tolist())
456 |
457 | cv2.imwrite(osp.join(save_dir, f"{img_info['id']}.png"), img)
458 | for sent_id, bboxes in sent_boxes.items():
459 | merge_img = visualize_bbox_on_image(img, bboxes)
460 | if show_sent:
461 | sent_en = self.load_sents(sent_ids=sent_id)[0]["raw_sent"]
462 | merge_img = paste_text(merge_img, sent_en)
463 | cv2.imwrite(
464 | osp.join(save_dir, f"{img_info['id']}_{sent_id}.png"), merge_img
465 | )
466 |
467 | def show_group_bbox(self, group_ids, save_root, show_sent=True):
468 | group_infos = self.load_groups(group_ids=group_ids)
469 | for group_info in group_infos:
470 | save_dir = osp.join(save_root, group_info["group_name"])
471 | os.makedirs(save_dir, exist_ok=True)
472 | self.show_image_bbox(
473 | img_ids=group_info["img_id"], save_dir=save_dir, show_sent=show_sent
474 | )
475 |
476 | def stat_description(self, with_rev=False, inter_group=False):
477 | """calculate and print dataset statistics.
478 |
479 | Args:
480 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False.
481 | inter_group (bool, optional): calculate under intra- or inter-group settings. Defaults to False.
482 | """
483 | stat_dict = {}
484 | # Number of sents
485 | sent_ids = list(self.data["sentences"].keys())
486 | if not with_rev:
487 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)]
488 | stat_dict["nsent"] = len(sent_ids)
489 | # Number of annos / instance # TODO: rm rev
490 | stat_dict["nanno"] = len(self.data["annotations"].keys())
491 | # Number of images
492 | stat_dict["nimg"] = len(self.data["images"].keys())
493 | # Number of groups
494 | stat_dict["ngroup"] = len(self.data["groups"].keys())
495 |
496 | # Number of img-sent pair
497 | num_img_sent = 0
498 | for img_id in self.data["images"].keys():
499 | anno_ids = self.get_anno_ids(img_ids=img_id)
500 | anno_infos = self.load_annos(anno_ids=anno_ids)
501 | cur_sent_set = set()
502 | group_sent_ids = set(
503 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"]
504 | )
505 | for anno_info in anno_infos:
506 | cur_sent_set |= set(
507 | [i for i in anno_info["sent_id"] if i in group_sent_ids]
508 | )
509 | if not with_rev:
510 | cur_sent_set = [
511 | sent_id for sent_id in cur_sent_set if not self.is_revsent(sent_id)
512 | ]
513 | num_img_sent += len(cur_sent_set)
514 | stat_dict["num_img_sent"] = num_img_sent
515 |
516 | # Number of absence img-sent pair
517 | num_anti_img_sent = 0
518 | for img_id in self.data["images"].keys():
519 | anno_ids = self.get_anno_ids(img_ids=img_id)
520 | anno_infos = self.load_annos(anno_ids=anno_ids)
521 | cur_sent_set = set()
522 | group_sent_ids = set(
523 | self.load_groups(self.get_group_ids(img_ids=img_id))[0]["inner_sent_id"]
524 | )
525 | for anno_info in anno_infos:
526 | cur_sent_set |= set(
527 | [i for i in anno_info["sent_id"] if i in group_sent_ids]
528 | )
529 | assert group_sent_ids.issuperset(
530 | cur_sent_set
531 | ), f"{group_sent_ids}, {cur_sent_set}"
532 | cur_anti_sent_set = group_sent_ids - cur_sent_set
533 | if not with_rev:
534 | cur_anti_sent_set = [
535 | sent_id
536 | for sent_id in cur_anti_sent_set
537 | if not self.is_revsent(sent_id)
538 | ]
539 | num_anti_img_sent += len(cur_anti_sent_set)
540 | stat_dict["num_anti_img_sent"] = num_anti_img_sent
541 |
542 | # Number of anno-sent pair
543 | num_anno_sent = 0
544 | anno_infos = self.load_annos()
545 | for anno_info in anno_infos:
546 | if inter_group:
547 | anno_sent_ids = [i for i in anno_info["sent_id"]]
548 | else:
549 | group_sent_ids = set(
550 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"]
551 | )
552 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in group_sent_ids]
553 | if not with_rev:
554 | anno_sent_ids = [
555 | sent_id for sent_id in anno_sent_ids if not self.is_revsent(sent_id)
556 | ]
557 | num_anno_sent += len(anno_sent_ids)
558 |
559 | stat_dict["num_anno_sent"] = num_anno_sent
560 |
561 | # Number of anti anno-sent pair
562 | num_anti_anno_sent = 0
563 | anno_infos = self.load_annos()
564 | for anno_info in anno_infos:
565 | if inter_group:
566 | all_sent_ids = set(self.get_sent_ids())
567 | anno_sent_ids = anno_info["sent_id"]
568 |
569 | anti_sent_ids = [
570 | sent_id for sent_id in all_sent_ids if sent_id not in anno_sent_ids
571 | ]
572 | else:
573 | group_sent_ids = set(
574 | self.load_groups(anno_info["group_id"])[0]["inner_sent_id"]
575 | )
576 | anno_sent_ids = [i for i in anno_info["sent_id"] if i in group_sent_ids]
577 |
578 | anti_sent_ids = [
579 | sent_id
580 | for sent_id in group_sent_ids
581 | if sent_id not in anno_sent_ids
582 | ]
583 |
584 | if not with_rev:
585 | anti_sent_ids = [
586 | sent_id for sent_id in anti_sent_ids if not self.is_revsent(sent_id)
587 | ]
588 | num_anti_anno_sent += len(anti_sent_ids)
589 |
590 | stat_dict["num_anti_anno_sent"] = num_anti_anno_sent
591 |
592 |         # Average sentence length
593 |         total_len = 0
594 |         for sent_info in self.load_sents(sent_ids):
595 |             total_len += len(sent_info["raw_sent"].split())
596 |
597 |         stat_dict["avg_sent_len"] = total_len / stat_dict["nsent"]
598 |
599 | print(stat_dict)
600 |
601 | def is_revsent(self, sent_id):
602 | sent_info = self.load_sents(sent_ids=sent_id)
603 | return sent_info[0]["is_negative"]
604 |
605 | def data2coca(self, out_root, with_rev=False):
606 | group_infos = self.load_groups()
607 | for group_info in group_infos:
608 | sent_ids = group_info["inner_sent_id"]
609 | if not with_rev:
610 | sent_ids = [
611 | sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)
612 | ]
613 | sent_infos = self.load_sents(sent_ids)
614 | for sent_info in sent_infos:
615 | sent = sent_info["raw_sent"]
616 | img_infos = self.load_imgs(group_info["img_id"])
617 | for img_info in img_infos:
618 | src_img_path = osp.join(self.image_dir, img_info["file_name"])
619 | raw_name = img_info["file_name"].split("/")[-1]
620 | out_img_dir = osp.join(out_root, "images", sent)
621 | os.makedirs(out_img_dir, exist_ok=True)
622 | out_img_path = osp.join(out_img_dir, raw_name)
623 | copy_file(src_img_path, out_img_path)
624 |
625 | out_mask_dir = osp.join(out_root, "masks", sent)
626 | os.makedirs(out_mask_dir, exist_ok=True)
627 | out_mask_path = osp.join(
628 | out_mask_dir, raw_name.replace(".jpg", ".png")
629 | )
630 |
631 | cur_anno_ids = self.get_anno_ids(
632 | img_ids=img_info["id"], sent_ids=sent_info["id"]
633 | )
634 | anno_infos = self.load_annos(cur_anno_ids)
635 | rle_list = [anno_info["segmentation"] for anno_info in anno_infos]
636 | bmask = merge2bin(rle_list, img_info["height"], img_info["width"])
637 | cv2.imwrite(out_mask_path, bmask)
638 |
639 | def convert2coco(self, out_root, anti_mode=False, is_group_separated=True):
640 | """
641 | Convert the annotation format of D^3 dataset to COCO.
642 | 1. The sent_id can be viewed as category_id in COCO.
643 | 2. If `is_group_separated` is True, `outer_sent_id` does not need to be considered.
644 |         3. `anti_mode` controls absence descriptions: 0 keeps all sentences, 1 drops sentences for which `is_revsent` is True, and 2 keeps only those sentences.
645 | """
646 | os.makedirs(out_root, exist_ok=True)
647 | coco_dict = {
648 | "images": [],
649 | "categories": [],
650 | "annotations": [],
651 | }
652 |
653 | sent_ids = self.get_sent_ids()
654 | if anti_mode == 1:
655 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)]
656 | elif anti_mode == 2:
657 | sent_ids = [sent_id for sent_id in sent_ids if self.is_revsent(sent_id)]
658 | elif anti_mode == 0:
659 | pass
660 | else:
661 | raise Exception("Unimplemented anti_mode.")
662 |
663 | sent_infos = self.load_sents(sent_ids)
664 | for isent_info in sent_infos:
665 | coco_dict["categories"].append(
666 | {
667 | "id": isent_info["id"],
668 | "name": isent_info["raw_sent"],
669 | }
670 | )
671 |
672 | item_id = 0
673 | img_infos = self.load_imgs()
674 | for iimg_info in img_infos:
675 | coco_dict["images"].append(
676 | {
677 | "id": iimg_info["id"],
678 | "file_name": iimg_info["file_name"],
679 | "height": iimg_info["height"],
680 | "width": iimg_info["width"],
681 | }
682 | )
683 |
684 | anno_ids = self.get_anno_ids(img_ids=iimg_info["id"])
685 | anno_infos = self.load_annos(anno_ids)
686 |
687 | for ianno_info in anno_infos:
688 | if is_group_separated:
689 | inner_group_sent_ids = [
690 | isent_id
691 | for isent_id in ianno_info["sent_id"]
692 | if isent_id
693 | in self.load_groups(ianno_info["group_id"])[0]["inner_sent_id"]
694 | ]
695 | cur_sent_ids = inner_group_sent_ids
696 | else:
697 | cur_sent_ids = ianno_info["sent_id"]
698 |
699 | for isent_id in cur_sent_ids:
700 | if isent_id not in sent_ids:
701 | continue
702 |
703 | seg = ianno_info["segmentation"][0].copy()
704 | if isinstance(seg, dict): # RLE
705 | counts = seg["counts"]
706 | if not isinstance(counts, str):
707 | # make it json-serializable
708 | seg["counts"] = counts.decode("ascii")
709 |
710 | coco_dict["annotations"].append(
711 | {
712 | "id": item_id,
713 | "image_id": iimg_info["id"],
714 | "category_id": isent_id,
715 | "segmentation": seg,
716 | "area": int(ianno_info["area"][0]),
717 | "bbox": [
718 | int(cord) for cord in ianno_info["bbox"][0].tolist()
719 | ],
720 | "iscrowd": 0, # TODO: ianno_info["iscrowd"]
721 | }
722 | )
723 | item_id += 1
724 |
725 | with open(osp.join(out_root, "coco_annotations.json"), "w") as f:
726 | json.dump(coco_dict, f, indent=4)
727 |
728 | def sent_analyse(self, save_dir, with_rev=False):
729 | """analyze word info in D-cube and generate word length histograms, word clouds, etc.
730 |
731 | Args:
732 | save_dir (str): path to save the visualized results.
733 | with_rev (bool, optional): consider absence descriptions or not. Defaults to False.
734 | """
735 | sent_ids = self.get_sent_ids()
736 | if not with_rev:
737 | sent_ids = [sent_id for sent_id in sent_ids if not self.is_revsent(sent_id)]
738 |
739 | sent_lens, sent_raws = [], []
740 | sent_infos = self.load_sents(sent_ids)
741 | for isent_info in sent_infos:
742 | sent_raws.append(isent_info["raw_sent"])
743 | sent_lens.append(len(isent_info["raw_sent"].split()))
744 |
745 | os.makedirs(save_dir, exist_ok=True)
746 | # plot_hist(
747 | # sent_lens,
748 | # bins=max(sent_lens) - min(sent_lens) + 1,
749 | # save_path=osp.join(save_dir, "words_hist.pdf"),
750 | # x="Lengths of descriptions",
751 | # )
752 | # generate_wordclouds(sent_raws, osp.join(save_dir, "word_clouds"))
753 |
754 | def group_analysis(self, save_dir, with_rev=False):
755 | group_infos = self.load_groups()
756 | scene_tree = defaultdict(dict)
757 |
758 | for group_info in group_infos:
759 | scene_tree[group_info["scene"]][group_info["group_name"]] = {"nimg": 0.1}
760 |
761 | # vis_group_tree(scene_tree, osp.join(save_dir, 'scene_tree.png')) # the visualized result is ugly
762 |
763 | def bbox_num_analyze(self):
764 | n_cat = len(self.data["sentences"].keys())
765 | all_img_ids = self.data["images"].keys()
766 | n_img = len(all_img_ids)
767 | cat_obj_count = np.zeros((n_cat, n_img), dtype=int)
768 | for img_id in all_img_ids:
769 | # img_cat_ids = self.get_sent_ids(img_ids=img_id)
770 | anno_ids = self.get_anno_ids(img_ids=img_id)
771 | anno_infos = self.load_annos(anno_ids=anno_ids)
772 | for anno in anno_infos:
773 | for sid in anno["sent_id"]:
774 | cat_obj_count[sid - 1, img_id] += 1
775 | return cat_obj_count
776 |
--------------------------------------------------------------------------------
/d_cube/data_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Zhao Zhang"
3 | __maintainer__ = "Chi Xie"
4 | # data utility functions are defined in the script
5 | import json
6 | import pickle
7 | import shutil
8 |
9 | # from io import StringIO
10 | # import string
11 |
12 | import numpy as np
13 | import cv2
14 | from pycocotools import mask as cocomask
15 |
16 | VOC_COLORMAP = [
17 | [128, 0, 0],
18 | [0, 128, 0],
19 | [128, 128, 0],
20 | [0, 0, 128],
21 | [128, 0, 128],
22 | [0, 128, 128],
23 | [128, 128, 128],
24 | [64, 0, 0],
25 | [192, 0, 0],
26 | [64, 128, 0],
27 | [192, 128, 0],
28 | [64, 0, 128],
29 | [192, 0, 128],
30 | [64, 128, 128],
31 | [192, 128, 128],
32 | [0, 64, 0],
33 | [128, 64, 0],
34 | [0, 192, 0],
35 | [128, 192, 0],
36 | [0, 64, 128],
37 | ]
38 |
39 |
40 | def visualize_bbox_on_image(img, bbox_list, save_path=None, thickness=3):
41 | img_copy = img.copy()
42 | for i, bbox in enumerate(bbox_list):
43 | color = tuple(VOC_COLORMAP[i % len(VOC_COLORMAP)])
44 | x, y, w, h = bbox
45 | img_copy = cv2.rectangle(
46 | img_copy, (int(x), int(y)), (int((x + w)), int(y + h)), color, thickness
47 | )
48 | if save_path:
49 | cv2.imwrite(save_path, img_copy)
50 | return img_copy
51 |
52 |
53 | def rle2bmask(rle):
54 | bm = cocomask.decode(rle)
55 | if len(bm.shape) == 3:
56 | bm = np.sum(
57 | bm, axis=2
58 | ) # sometimes there are multiple binary map (corresponding to multiple segs)
59 | bm = bm.astype(np.uint8) # convert to np.uint8
60 | return bm
61 |
62 |
63 | def merge_rle(rle_list, is_instance=True, on_image=False):
64 | if is_instance:
65 | cm_list = []
66 | for rle_idx, rle in enumerate(rle_list):
67 | color = VOC_COLORMAP[rle_idx]
68 | bm = rle2bmask(rle)
69 | cm = cv2.cvtColor(bm, cv2.COLOR_GRAY2BGR)
70 | cm_list.append(cm * color)
71 | merge_map = np.sum(cm_list, axis=0, dtype=np.uint8)
72 | else:
73 | bm_list = [rle2bmask(rle) for rle in rle_list]
74 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8)
75 | merge_map[merge_map >= 1] = 1
76 | if not on_image:
77 | color = VOC_COLORMAP[0]
78 | merge_map = cv2.cvtColor(merge_map, cv2.COLOR_GRAY2BGR)
79 | merge_map *= np.array(color, dtype=np.uint8)
80 |
81 | merge_map[merge_map > 255] = 255
82 |
83 | if not on_image:
84 | tmp_sum_map = np.sum(merge_map, axis=-1)
85 | merge_map[tmp_sum_map == 0] = 220
86 | return merge_map
87 |
88 |
89 | def merge2bin(rle_list, img_h, img_w):
90 | if rle_list:
91 | bm_list = [rle2bmask(rle) for rle in rle_list]
92 | merge_map = np.sum(bm_list, axis=0, dtype=np.uint8)
93 | merge_map[merge_map >= 1] = 255
94 | merge_map = np.expand_dims(merge_map, axis=-1)
95 | return merge_map
96 | else:
97 | return np.zeros([img_h, img_w, 1], dtype=np.uint8)
98 |
99 |
100 | def paste_text(img, text):
101 | fontFace = cv2.FONT_HERSHEY_COMPLEX_SMALL
102 | overlay = img.copy()
103 | # fontFace = cv2.FONT_HERSHEY_TRIPLEX
104 | fontScale = 1
105 | thickness = 1
106 |     background_alpha = 0.8
107 |
108 | retval, baseLine = cv2.getTextSize(
109 | text, fontFace=fontFace, fontScale=fontScale, thickness=thickness
110 | )
111 | topleft = (0, 0)
112 | # bottomright = (topleft[0] + retval[0], topleft[1] + retval[1]+10)
113 | bottomright = (img.shape[1], topleft[1] + retval[1] + 10)
114 |
115 | cv2.rectangle(overlay, topleft, bottomright, thickness=-1, color=(250, 250, 250))
116 |     img = cv2.addWeighted(overlay, background_alpha, img, 1 - background_alpha, 0)
117 |
118 | cv2.putText(
119 | img,
120 | text,
121 | (0, baseLine + 10),
122 | fontScale=fontScale,
123 | fontFace=fontFace,
124 | thickness=thickness,
125 | color=(10, 10, 10),
126 | )
127 | return img
128 |
129 |
130 | def load_json(json_path, to_int=False):
131 | clean_res_dic = {}
132 | with open(json_path, "r", encoding="utf-8") as f_in:
133 | res_dic = json.load(f_in)
134 |
135 | for ikey, iv in res_dic.items():
136 | ikey = int(ikey.strip()) if to_int else ikey.strip()
137 | clean_res_dic[ikey] = iv
138 |
139 | return clean_res_dic
140 |
141 |
142 | def path_map(src_path, obj_path):
143 | def inner_map(full_path):
144 | return full_path.replace(src_path, obj_path)
145 |     return inner_map
146 |
147 | def save_pkl(src, obj_path):
148 | with open(obj_path, "wb") as f_out:
149 | pickle.dump(src, f_out)
150 |
151 |
152 | def load_pkl(src_path):
153 | with open(src_path, "rb") as f_in:
154 | in_pkl = pickle.load(f_in)
155 | return in_pkl
156 |
157 |
158 | def copy_file(src_path, obj_path):
159 | shutil.copy(src_path, obj_path)
160 |
161 |
162 | def sentence_analysis():
163 | return 0
164 |
165 |
166 | def add_checkerboard_bg(image, mask, save_path=None):
167 | # Create a new image with the same size as the original image
168 | new_image = np.zeros_like(image)
169 |
170 | # Define the size of the checkerboard pattern
171 | checkerboard_size = 24
172 |
173 | # Loop over each pixel in the mask
174 | for x in range(mask.shape[1]):
175 | for y in range(mask.shape[0]):
176 | # If the pixel is transparent, draw a checkerboard pattern
177 | if mask[y, x] == 0:
178 | if (x // checkerboard_size) % 2 == (y // checkerboard_size) % 2:
179 | new_image[y, x] = (255, 255, 255)
180 | else:
181 | new_image[y, x] = (128, 128, 128)
182 | # Otherwise, copy the corresponding pixel from the original image
183 | else:
184 | new_image[y, x] = image[y, x]
185 |
186 | # Save the new image with the checkerboard background
187 | if save_path:
188 | cv2.imwrite(save_path, new_image)
189 | return new_image
190 |
191 |
192 | def visualize_mask_on_image(
193 | img, mask, save_path=None, add_edge=False, dark_background=False
194 | ):
195 | # Convert the mask to a binary mask if it's not already
196 | if mask.max() > 1:
197 | mask = mask.astype(np.uint8) // 255
198 |
199 | # Convert the mask to a 3-channel mask if it's not already
200 | if len(mask.shape) == 2:
201 | mask = np.expand_dims(mask, axis=2)
202 | mask = np.tile(mask, (1, 1, 3))
203 |
204 | # Create a color map for the mask
205 | cmap = np.array([255, 117, 44], dtype=np.uint8)
206 | mask_colors = mask * cmap
207 |
208 | # Add an opaque white edge to the mask if desired
209 | if add_edge:
210 | if len(mask.shape) == 2:
211 | mask = np.expand_dims(mask, axis=2)
212 | mask = np.tile(mask, (1, 1, 3))
213 |
214 | kernel = np.ones((5, 5), dtype=np.uint8)
215 | mask_edge = cv2.erode(mask, kernel, iterations=1)
216 | mask_edge = mask - mask_edge
217 |
218 | # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3])
219 | mask_colors[mask_edge > 0] = 255
220 |
221 | # Overlay the mask on the masked image
222 | if dark_background:
223 | masked_img = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0)
224 | else:
225 | masked_img = img.copy()
226 | masked_img[mask > 0] = cv2.addWeighted(img, 0.4, mask_colors, 0.6, 0)[mask > 0]
227 |
228 | # Save the result to the specified path if provided
229 | if save_path is not None:
230 | cv2.imwrite(save_path, masked_img)
231 |
232 | return masked_img
233 |
234 |
235 | # def visualize_mask_on_image(img, mask, save_path=None, add_edge=False):
236 | # # Convert the mask to a binary mask if it's not already
237 | # if mask.max() > 1:
238 | # mask = mask.astype(np.uint8) // 255
239 |
240 | # # Convert the mask to a 3-channel mask if it's not already
241 | # if len(mask.shape) == 2:
242 | # mask = np.expand_dims(mask, axis=2)
243 | # mask = np.tile(mask, (1, 1, 3))
244 |
245 | # # Create a color map for the mask
246 | # cmap = np.array([255, 117, 44], dtype=np.uint8)
247 | # mask_colors = mask * cmap
248 |
249 | # # Add an opaque white edge to the mask if desired
250 | # if add_edge:
251 | # if len(mask.shape) == 2:
252 | # mask = np.expand_dims(mask, axis=2)
253 | # mask = np.tile(mask, (1, 1, 3))
254 |
255 | # kernel = np.ones((5, 5), dtype=np.uint8)
256 | # mask_edge = cv2.erode(mask, kernel, iterations=1)
257 | # mask_edge = mask - mask_edge
258 |
259 | # # mask_edge = np.tile(mask_edge[:, :, np.newaxis], [1, 1, 3])
260 | # mask_colors[mask_edge > 0] = 255
261 |
262 | # # Overlay the mask on the masked image
263 | # masked_img = cv2.addWeighted(img, 0.5, mask_colors, 0.5, 0)
264 |
265 | # # Save the result to the specified path if provided
266 | # if save_path is not None:
267 | # cv2.imwrite(save_path, masked_img)
268 |
269 | # return masked_img
270 |
--------------------------------------------------------------------------------
/d_cube/vis_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Zhao Zhang"
3 | __maintainer__ = "Chi Xie"
4 | import os
5 | from collections import Counter
6 |
7 | import spacy
8 | import matplotlib.pyplot as plt
9 | import seaborn as sns
10 | from wordcloud import WordCloud
11 |
12 | # from pycirclize import Circos
13 | # from Bio.Phylo.BaseTree import Tree
14 | # from Bio import Phylo
15 | # from newick import Node
16 |
17 |
18 | def plot_hist(data, bins=10, is_norm=False, save_path=None, x=None):
19 | sns.set_theme(style="whitegrid", font_scale=2.0)
20 | ax = sns.histplot(data, bins=bins, common_norm=is_norm, kde=False)
21 | ax.set_xlabel(x)
22 | plt.tight_layout()
23 | plt.savefig(save_path)
24 | plt.close()
25 |
26 |
27 | def plot_bars(names, nums, is_sort, save_path=None):
28 | sns.set(style="whitegrid")
29 |
30 | if is_sort:
31 | zipped = zip(nums, names)
32 | sort_zipped = sorted(zipped, key=lambda x: (x[0], x[1]))
33 | result = zip(*sort_zipped)
34 | nums, names = [list(x) for x in result]
35 |
36 | fontx = {"family": "Times New Roman", "size": 10}
37 |     fig, ax = plt.subplots(figsize=(16, 4))
38 |
39 |     # sns.set_palette("PuBuGn_d")
40 |     sns.barplot(x=names, y=nums, ax=ax, palette=sns.cubehelix_palette(80, start=0.5, rot=-0.75))
41 | fig.autofmt_xdate(rotation=90)
42 | plt.tick_params(axis="x", labelsize=10)
43 | labels = ax.get_xticklabels() + ax.get_yticklabels()
44 | [label.set_fontname("Times New Roman") for label in labels]
45 | plt.tight_layout()
46 | plt.savefig(save_path)
47 |
48 |
49 | def generate_wordclouds(sentences, save_dir):
50 | """Generates word clouds for different parts of speech in a list of sentences.
51 |
52 | Args:
53 | sentences: A list of sentences.
54 | save_dir: The directory to save the word cloud images.
55 | """
56 |
57 | os.makedirs(save_dir, exist_ok=True)
58 | # Load the spacy model
59 | nlp = spacy.load("en_core_web_sm")
60 |
61 | # Define the parts of speech to include in the word clouds
62 | pos_to_include = ["NOUN", "VERB", "ADJ", "ADV"]
63 |
64 | # Process each sentence and collect the relevant words for each part of speech
65 | words_by_pos = {pos: [] for pos in pos_to_include}
66 | for sent in sentences:
67 | doc = nlp(sent)
68 | for token in doc:
69 | if token.pos_ in pos_to_include:
70 | words_by_pos[token.pos_].append(token.lemma_.lower())
71 |
72 | # Generate a word cloud for each part of speech
73 | for pos, words in words_by_pos.items():
74 | if len(words) == 0:
75 | continue # skip parts of speech with no words
76 |
77 | # Count the frequency of each word
78 | word_counts = Counter(words)
79 |
80 | # Generate the word cloud
81 | wordcloud = WordCloud(
82 | width=800,
83 | height=800,
84 | background_color="white",
85 | max_words=200,
86 | colormap="Set2",
87 | max_font_size=150,
88 | ).generate_from_frequencies(word_counts)
89 |
90 | # Save the word cloud image
91 | filename = f"{pos.lower()}_wordcloud.png"
92 | filepath = os.path.join(save_dir, filename)
93 | wordcloud.to_file(filepath)
94 |
95 |
96 | # def vis_group_tree(data_dict, save_path):
97 |
98 | # # Create 3 randomized trees
99 | # tree_size_list = [60, 40, 50]
100 | # trees = [Tree.randomized(string.ascii_uppercase, branch_stdev=0.5) for size in tree_size_list]
101 |
102 | # # Initialize circos sector with 3 randomized tree size
103 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)}
104 | # circos = Circos(sectors, space=5)
105 |
106 | # colors = ["tomato", "skyblue", "limegreen"]
107 | # cmaps = ["bwr", "viridis", "Spectral"]
108 | # for idx, sector in enumerate(circos.sectors):
109 | # sector.text(sector.name, r=120, size=12)
110 | # # Plot randomized tree
111 | # tree = trees[idx]
112 | # tree_track = sector.add_track((30, 70))
113 | # tree_track.axis(fc=colors[idx], alpha=0.2)
114 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21)
115 | # # Plot randomized bar
116 | # bar_track = sector.add_track((70, 90))
117 | # x = np.arange(0, int(sector.size)) + 0.5
118 | # height = np.random.randint(1, 10, int(sector.size))
119 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//")
120 |
121 | # circos.savefig(save_path, dpi=600)
122 |
123 | # def clean_newick_key(in_str):
124 | # bad_chars = [':', ';', ',', '(', ')']
125 | # for bad_char in bad_chars:
126 | # in_str = in_str.replace(bad_char, ' ')
127 | # return in_str
128 |
129 | # def build_tree_from_dict(data):
130 | # root = Node() # create the root node
131 | # for key, value in data.items():
132 | # node = Node(name=clean_newick_key(key)) # name doesn't need to be cleaned
133 | # if value is not None:
134 | # child_node = build_tree_from_dict(value)
135 | # node.add_descendant(child_node)
136 | # root.add_descendant(node)
137 |
138 | # return root
139 |
140 |
141 | def replace_chars_in_dict_keys(d):
142 | """
143 | Replaces the characters ':', ';', ',', '(', and ')' in the keys of a nested dictionary with '_'.
144 | """
145 | new_dict = {}
146 | for k, v in d.items():
147 | if isinstance(v, dict):
148 | v = replace_chars_in_dict_keys(v)
149 | new_key = k.translate(str.maketrans(":;,()", "_____"))
150 | new_dict[new_key] = v
151 | return new_dict
152 |
153 |
154 | def build_newick_tree(tree_dict):
155 | newick_tree = ""
156 | if isinstance(tree_dict, dict):
157 | for key, value in tree_dict.items():
158 | if isinstance(value, dict):
159 | subtree = build_newick_tree(value)
160 | if subtree:
161 | newick_tree += "(" + subtree + ")" + key + ","
162 | else:
163 | newick_tree += key + ","
164 | else:
165 | newick_tree += key + ":" + str(value) + ","
166 | newick_tree = newick_tree.rstrip(",") + ")"
167 | return newick_tree
168 | else:
169 | return None
170 |
171 |
172 | # def vis_group_tree(data_dict, save_path):
173 | # data_dic = replace_chars_in_dict_keys(data_dict)
174 | # super_group_names = data_dict.keys()
175 |
176 | # # Create 3 randomized trees
177 | # tree_size_list = [60, 40, 50]
178 | # trees = [Phylo.read(StringIO(build_newick_tree(data_dict[super_group_name])), "newick") for super_group_name in super_group_names]
179 |
180 | # # Initialize circos sector with 3 randomized tree size
181 | # sectors = {name: size for name, size in zip(list("ABC"), tree_size_list)}
182 | # circos = Circos(sectors, space=5)
183 |
184 | # colors = ["tomato", "skyblue", "limegreen"]
185 | # cmaps = ["bwr", "viridis", "Spectral"]
186 | # for idx, sector in enumerate(circos.sectors):
187 | # sector.text(sector.name, r=120, size=12)
188 | # # Plot randomized tree
189 | # tree = trees[idx]
190 | # tree_track = sector.add_track((30, 70))
191 | # tree_track.axis(fc=colors[idx], alpha=0.2)
192 | # tree_track.tree(tree, leaf_label_size=3, leaf_label_margin=21)
193 | # # Plot randomized bar
194 | # bar_track = sector.add_track((70, 90))
195 | # x = np.arange(0, int(sector.size)) + 0.5
196 | # height = np.random.randint(1, 10, int(sector.size))
197 | # bar_track.bar(x, height, facecolor=colors[idx], ec="grey", lw=0.5, hatch="//")
198 |
199 | # circos.savefig(save_path, dpi=600)
200 |
--------------------------------------------------------------------------------
/doc.md:
--------------------------------------------------------------------------------
1 | # $D^3$ Toolkit Documentation
2 |
3 |
4 | ## Table of Contents
5 |
6 | - [Inference](#inference-on-d3)
7 | - [Key Concepts](#key-concepts-for-users)
8 | - [Evaluation Settings](#evaluation-settings)
9 | - [Evaluation Code and Examples](#evaluation-code-and-examples)
10 | - [Dataset statistics](#dataset-statistics)
11 |
12 |
13 |
14 |
15 | ## Inference on $D^3$
16 |
17 | ```python
18 | # import the dataset class
19 | from d_cube import D3
20 | # init a dataset instance
21 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
22 | all_img_ids = d3.get_img_ids() # get the image ids in the dataset
23 | all_img_info = d3.load_imgs(all_img_ids) # load images by passing a list containing some image ids
24 | img_path = all_img_info[0]["file_name"] # obtain one image path so you can load it and run inference
25 | # then you can load the image as input for your model
26 |
27 | group_ids = d3.get_group_ids(img_ids=[img_id]) # get the group ids by passing anno ids, image ids, etc.
28 | sent_ids = d3.get_sent_ids(group_ids=group_ids) # get the sentence ids by passing image ids, group ids, etc.
29 | sent_list = d3.load_sents(sent_ids=sent_ids)
30 | ref_list = [sent['raw_sent'] for sent in sent_list] # list[str]
31 | # use these language references in `ref_list` as the references to your REC/OVD/DOD model
32 |
33 | # save the result to a JSON file
34 | ```
35 |
36 | Concepts and structures of `anno`, `image`, `sent` and `group` are explained in [this part](#key-concepts-for-users).
37 |
38 | In [this directory](eval_sota/) we provide inference (and evaluation) scripts for some existing SOTA OVD/REC methods.
39 |
40 |
41 |
42 | ### Output Format
43 | When inference is done, you need to save a JSON file in the format below (the standard COCO result format):
44 | ```json
45 | [
46 | {
47 | "category_id": "int, the value of sent_id, range [1, 422]",
48 | "bbox": "list[int], [x1, y1, w, h], predicted by your model, same as COCO result format, absolute value in the range of [w, h, w, h]",
49 | "image_id": "int, img_id, can be 0, 1, 2, ....",
50 | "score": "float, predicted by your model, no restriction on its absolute value range"
51 | }
52 | ]
53 | ```
54 | This JSON file should contain a list, where each item in the list is a dictionary of one detection result.
55 |
56 | With this JSON saved, you can evaluate the JSON in the next step. See [the evaluation step](#evaluation-code-and-examples).
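As a minimal sketch of producing such a file (the `results` variable and `save_d3_predictions` helper below are hypothetical placeholders for your own model outputs, not part of the toolkit):

```python
import json

def save_d3_predictions(results, out_path):
    """Dump detections to a COCO-style result JSON for D^3 evaluation.

    `results` is assumed to be an iterable of tuples
    (img_id, sent_id, [x1, y1, w, h], score) produced by your own model.
    """
    pred = []
    for img_id, sent_id, box, score in results:
        pred.append(
            {
                "image_id": int(img_id),
                "category_id": int(sent_id),      # the sent_id acts as the COCO category_id
                "bbox": [float(v) for v in box],  # absolute [x1, y1, w, h] in pixels
                "score": float(score),
            }
        )
    with open(out_path, "w") as f:
        json.dump(pred, f)

# hypothetical usage:
# save_d3_predictions(results, "prediction.json")
```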
57 |
58 |
59 |
60 |
61 |
62 | ## Key Concepts for Users
63 |
64 | ### `anno`
65 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs:
66 |
67 | * `id`: an integer representing the ID of the annotation.
68 | * `sent_id`: a list of integers representing the IDs of sentences associated with this annotation.
69 | * `segmentation`: a Run Length Encoding (RLE) representation of the annotation.
70 | * `area`: an integer representing the area of the annotation.
71 | * `iscrowd`: an integer indicating whether this annotation represents a crowd or not.
72 | * `image_id`: an integer representing the ID of the image associated with this annotation.
73 | * `bbox`: a list of four integers representing the bounding box coordinates of the annotation in the format [x, y, width, height].
74 | * `group_id`: a value that can be any object and represents the ID of the group associated with this annotation.
75 |
76 | ``` python
77 | {
78 | 1 : {
79 | "id": int,
80 | "sent_id": list,
81 | "segmentation": RLE,
82 | "area": int,
83 | "iscrowd": int,
84 | "image_id": int,
85 | "bbox": list, # [x, y, width, height]
86 | "group_id": int
87 | }
88 | }
89 | ```
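As a quick usage sketch (reusing the `d3` instance and `all_img_info` from the inference example above), the annotations of one image can be fetched like this:

```python
img_id = all_img_info[0]["id"]
anno_ids = d3.get_anno_ids(img_ids=img_id)      # ids of annotations on this image
anno_infos = d3.load_annos(anno_ids=anno_ids)
for anno in anno_infos:
    # each box is [x, y, width, height]; sent_id lists the sentences referring to this object
    print(anno["bbox"], anno["sent_id"])
```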
90 |
91 | ### `image`
92 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs:
93 |
94 | * `id`: an integer representing the ID of the image.
95 | * `file_name`: a string representing the file name of the image.
96 | * `height`: an integer representing the height of the image.
97 | * `width`: an integer representing the width of the image.
98 | * `flickr_url`: a string representing the Flickr URL of the image.
99 | * `anno_id`: a list of integers representing the IDs of annotations associated with this image.
100 | * `group_id`: an integer representing the ID of the group associated with this image.
101 | * `license`: a string representing the license of the image.
102 |
103 | ``` python
104 | {
105 | int : {
106 | "id": int,
107 | "file_name": str,
108 | "height": int,
109 | "width": int,
110 | "flickr_url": str,
111 | "anno_id": list,
112 | "group_id": int,
113 | "license": str,
114 | }
115 | }
116 | ```
117 |
118 | ### `sent`
119 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs:
120 |
121 | * `id`: an integer representing the ID of the sentence.
122 | * `anno_id`: a list of integers representing the IDs of annotations associated with this sentence.
123 | * `group_id`: a list of integers representing the IDs of groups associated with this sentence.
124 | * `is_negative`: a boolean indicating whether this sentence is *absence expression* or not. `True` means *absence expression*.
125 | * `raw_sent`: a string representing the raw text of the sentence in English.
126 | * `raw_sent_zh`: a string representing the raw text of the sentence in Chinese.
127 |
128 | ``` python
129 | {
130 | int : {
131 | "id": int,
132 | "anno_id": list,
133 | "group_id": list,
134 | "is_negative": bool,
135 | "raw_sent": str,
136 | "raw_sent_zh": str
137 | }
138 | }
139 | ```
140 |
141 | ### `group`
142 | A Python dictionary where the keys are integers and the values are dictionaries with the following key-value pairs:
143 |
144 | * `id`: an integer representing the ID of the group.
145 | * `pos_sent_id`: a list of integers representing the IDs of sentences that have referred objects in this group.
146 | * `inner_sent_id`: a list of integers representing the IDs of sentences belonging to this group.
147 | * `outer_sent_id`: a list of integers representing the IDs of outer-group sentences that have referred objects in this group.
148 | * `img_id`: a list of integers representing the IDs of images of this group.
149 | * `scene`: a list of strings representing the scenes of this group.
150 | * `group_name`: a string representing the name of this group in English.
151 | * `group_name_zh`: a string representing the name of this group in Chinese.
152 |
153 | ``` python
154 | {
155 | int : {
156 | "id": int,
157 | "pos_sent_id": list,
158 | "inner_sent_id": list,
159 | "outer_sent_id": list,
160 | "img_id": list,
161 | "scene": list,
162 | "group_name": str,
163 | "group_name_zh": str
164 | }
165 | }
166 | ```
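To illustrate how these structures connect, here is a minimal sketch (again assuming the `D3` instance `d3` from the inference example) that walks from a group to its in-group descriptions:

```python
group_infos = d3.load_groups()  # loads all groups when no ids are passed
one_group = group_infos[0]
inner_sents = d3.load_sents(sent_ids=one_group["inner_sent_id"])
print(one_group["group_name"], [s["raw_sent"] for s in inner_sents])
```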
167 |
168 |
169 |
170 |
171 |
172 | ## Evaluation Settings
173 |
174 |
175 | ### Intra- or Inter-Group Settings
176 |
177 | The default evaluation protocol is the intra-group setting, where only certain references are evaluated for each image.
178 |
179 | In the $D^3$ dataset, images are collected for different groups (scenarios), and the categories (descriptions) are designed based on these scenarios. In the intra-group setting, each image is only evaluated with the descriptions from the group it belongs to. We call this the **intra-scenario setting**.
180 |
181 | Note that each category is actually annotated on each image (with positive or negative instances).
182 | So you can also evaluate all categories on all images, just like traditional detection datasets. We call this the **inter-scenario setting**.
183 | This setting is quite challenging for the DOD task, as it produces many false positives with current methods.
184 |
185 | For intra-group evaluation, you should use:
186 | ```python
187 | sent_ids = d3.get_sent_ids(group_ids=group_ids)
188 | # only get the refs (sents) for the group the image belongs to, which is usually 4
189 | ```
190 |
191 | For inter-group evaluation, change the corresponding code to:
192 |
193 | ```python
194 | sent_ids = d3.get_sent_ids()
195 | # get all the refs in the dataset
196 | ```
197 |
198 | This will use all the sentences in the dataset, rather than a few sentences in the group that this image belongs to.
199 |
200 | This is the only difference in the implementation and evaluation; no further code changes are needed.
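The sketch below shows where this switch sits in a typical inference loop; `detect` is a hypothetical stand-in for your own model call:

```python
for img_id in d3.get_img_ids():
    group_ids = d3.get_group_ids(img_ids=[img_id])
    sent_ids = d3.get_sent_ids(group_ids=group_ids)  # intra-group; swap in d3.get_sent_ids() for inter-group
    refs = [s["raw_sent"] for s in d3.load_sents(sent_ids=sent_ids)]
    # boxes, scores, labels = detect(img_id, refs)   # hypothetical model call
```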
201 |
202 | For more information, refer to Section 3.4 of the DOD paper.
203 |
204 |
205 | ### FULL, PRES and ABS
206 |
207 | FULL, PRES and ABS refer to the full description set (422 categories), the presence descriptions (316 categories), and the absence descriptions (106 categories), respectively.
208 |
209 | Absence descriptions are descriptions that involve the absence of some concepts, such as lacking certain relationships, attributes or objects. For example, "dog *without* leash", "person *without* helmet" and "a hat that is *not* blue" are absence descriptions.
210 | Similarly, descriptions involving *only* the presence of some concepts are presence descriptions.
211 |
212 | Most existing REC datasets have presence descriptions but few absence descriptions.
213 |
214 | For more details and the meaning of evaluating absence descriptions, please refer to Section 3.1 of the DOD paper.
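For instance, here is a minimal sketch (assuming the `D3` instance `d3` from the inference example) that splits the description ids by the documented `is_negative` flag:

```python
all_sent_ids = d3.get_sent_ids()
sent_infos = d3.load_sents(sent_ids=all_sent_ids)
abs_ids = [s["id"] for s in sent_infos if s["is_negative"]]       # absence (ABS) descriptions
pres_ids = [s["id"] for s in sent_infos if not s["is_negative"]]  # presence (PRES) descriptions
print(len(pres_ids), len(abs_ids))  # expected to be 316 and 106 for the first release
```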
215 |
216 |
217 |
218 |
219 | ## Evaluation Code and Examples
220 |
221 | In this part, we introduce how to evaluate performance and obtain the metric values given a prediction JSON file.
222 |
223 | ### Write a Snippet in Your Code
224 |
225 | This is based on [cocoapi (pycocotools)](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI), and is quite simple:
226 |
227 | ```python
228 | from pycocotools.coco import COCO
229 | from pycocotools.cocoeval import COCOeval
230 |
231 | # Eval results
232 | coco = COCO(gt_path) # `gt_path` is the ground-truth JSON path (different JSON for FULL, PRES or ABS settings in our paper)
233 | d3_model = coco.loadRes(pred_path) # `pred_path` is the prediction JSON file
234 | cocoEval = COCOeval(coco, d3_model, "bbox")
235 | cocoEval.evaluate()
236 | cocoEval.accumulate()
237 | cocoEval.summarize()
238 | ```
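For reference, the three settings correspond to three ground-truth JSON files; the file names below are the ones used in the [SOTA evaluation scripts](eval_sota/), and `JSON_ANNO_PATH` is a placeholder for wherever you keep the annotations:

```python
import os

JSON_ANNO_PATH = "/path/to/d3_json"  # placeholder path
gt_paths = {
    "FULL": os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json"),
    "PRES": os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json"),
    "ABS": os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json"),
}
```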
239 |
240 | ### An Off-the-shelf Script
241 |
242 | We also provide [a script](scripts/eval_and_analysis_json.py) that can produce the evaluation results (and some additional analysis) in our paper, given a prediction JSON.
243 | You can use it by:
244 | ```shell
245 | python eval_and_analysis_json.py YOUR_PREDICTION_JSON_PATH
246 | ```
247 |
248 | A few options are provided for format conversion or more analysis:
249 | ```shell
250 | python eval_and_analysis_json.py --help
251 |
252 | usage: An example script for $D^3$ evaluation with prediction file (JSON) [-h] [--partition-by-nbox] [--partition-by-lens] [--xyxy2xywh] pred_path
253 |
254 | positional arguments:
255 | pred_path path to prediction json
256 |
257 | optional arguments:
258 | -h, --help show this help message and exit
259 | --partition-by-nbox divide the images by num of boxes for each ref
260 | --partition-by-lens divide the references by their lengths
261 | --xyxy2xywh transform box coords from xyxy to xywh
262 | ```
263 |
264 |
265 | ### Evaluation Examples on SOTA Methods
266 |
267 | See [this directory](eval_sota/) for details. We include the evaluation scripts of some methods there.
268 |
269 |
270 |
271 | ## Dataset Statistics
272 |
273 | [A Python script](scripts/get_d3_stat.py) is provided for calculating the statistics of $D^3$ and visualizing figures such as histograms, word clouds, etc.
274 |
275 | The specific statistics of the dataset are available in Section 3.3 of the DOD paper.
276 |
--------------------------------------------------------------------------------
/eval_sota/README.md:
--------------------------------------------------------------------------------
1 | # Evaluating SOTA Methods on $D^3$
2 |
3 | ## Leaderboard
4 |
5 | In this directory, we keep the scripts or GitHub links (official or custom) for evaluating SOTA methods (REC/OVD/DOD/MLLM) on $D^3$:
6 |
7 | | Name | Paper | Original Tasks | Training Data | Evaluation Code | Intra-FULL/PRES/ABS/Inter-FULL/PRES/ABS | Source | Note |
8 | |:-----|:-----:|:----:|:-----:|:-----:|:-----:|:-----:|:-----:|
9 | | OFA-large | [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework (ICML 2022)](https://arxiv.org/abs/2202.03052) | REC | - | - | 4.2/4.1/4.6/0.1/0.1/0.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | - |
10 | | CORA-R50 | [CORA: Adapting CLIP for Open-Vocabulary Detection with Region Prompting and Anchor Pre-Matching (CVPR 2023)](https://openaccess.thecvf.com/content/CVPR2023/papers/Wu_CORA_Adapting_CLIP_for_Open-Vocabulary_Detection_With_Region_Prompting_and_CVPR_2023_paper.pdf) | OVD | - | - | 6.2/6.7/5.0/2.0/2.2/1.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - |
11 | | OWL-ViT-large | [Simple Open-Vocabulary Object Detection with Vision Transformers (ECCV 2022)](https://www.ecva.net/papers/eccv_2022/papers_ECCV/papers/136700714.pdf) | OVD | - | [DOD official](./owl_vit.py) | 9.6/10.7/6.4/2.5/2.9/2.1 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper |
12 | | SPHINX-7B | [SPHINX: The Joint Mixing of Weights, Tasks, and Visual Embeddings for Multi-modal Large Language Models (arxiv 2023)](https://arxiv.org/abs/2311.07575) | **MLLM** capable of REC | - | [DOD official](./sphinx.py) | 10.6/11.4/7.9/-/-/- | DOD authors | A lot of contribution from [Jie Li](https://github.com/theFool32) |
13 | | GLIP-T | [Grounded Language-Image Pre-training (CVPR 2022)](https://arxiv.org/abs/2112.03857) | OVD & PG | - | - | 19.1/18.3/21.5/-/-/- | GEN paper | - |
14 | | UNINEXT-huge | [Universal Instance Perception as Object Discovery and Retrieval (CVPR 2023)](https://arxiv.org/abs/2303.06674v2) | OVD & REC | - | [DOD official](https://github.com/Charles-Xie/UNINEXT_D3) | 20.0/20.6/18.1/3.3/3.9/1.6 | [DOD paper](https://arxiv.org/abs/2307.12813) | - |
15 | | Grounding-DINO-base | [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection (arxiv 2023)](https://arxiv.org/abs/2303.05499) | OVD & REC | - | [DOD official](./groundingdino.py) | 20.7/20.1/22.5/2.7/2.4/3.5 | [DOD paper](https://arxiv.org/abs/2307.12813) | Post-processing hyper-parameters may affect the performance and the result may not exactly match the paper |
16 | | OFA-DOD-base | [Described Object Detection: Liberating Object Detection with Flexible Expressions (NeurIPS 2023)](https://arxiv.org/abs/2307.12813) | DOD | - | - | 21.6/23.7/15.4/5.7/6.9/2.3 | [DOD paper](https://arxiv.org/abs/2307.12813) | - |
17 | | FIBER-B | [Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (NeurIPS 2022)](https://arxiv.org/abs/2206.07643) | OVD & REC | - | - | 22.7/21.5/26.0/-/-/- | GEN paper | - |
18 | | MM-Grounding-DINO | [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection (arxiv 2024)](https://arxiv.org/abs/2401.02361) | DOD & OVD & REC | O365, GoldG, GRIT, V3Det | [MM-GDINO official](https://github.com/open-mmlab/mmdetection/tree/main/configs/mm_grounding_dino#zero-shot-description-detection-datasetdod) | 22.9/21.9/26.0/-/-/- | MM-GDINO paper | - |
19 | | GEN (FIBER-B) | [Generating Enhanced Negatives for Training Language-Based Object Detectors (arxiv 2024)](https://arxiv.org/abs/2401.00094) | DOD | - | - | 26.0/25.2/28.1/-/-/- | GEN paper | Enhancement based on FIBER-B |
20 | | APE-large (D) | [Aligning and Prompting Everything All at Once for Universal Visual Perception (arxiv 2023)](https://arxiv.org/abs/2312.02153) | DOD & OVD & REC | COCO, LVIS, O365, OpenImages, Visual Genome, RefCOCO/+/g, SA-1B, GQA, PhraseCut, Flickr30k | [APE official](https://github.com/shenyunhang/APE) | 37.5/38.8/33.9/21.0/22.0/17.9 | APE paper | Extra training data helps for this amazing performance |
21 |
22 |
23 | Some extra notes:
24 | - Each method is currently recorded with *the variant with the highest performance* in this table if multiple variants are available, so this is only a leaderboard, not meant for fair comparison.
25 | - Methods like GLIP, FIBER, etc. are actually not evaluated on OVD benchmarks. For zero-shot evaluation on DOD, we currently do not distinguish between methods for OVD benchmarks and methods for ZS-OD, as long as the method has verified open-set detection capability.
26 |
27 | For other variants (e.g. for a fair comparison regarding data, backbone, etc.), please refer to the papers.
28 |
--------------------------------------------------------------------------------
/eval_sota/groundingdino.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie"
3 | __maintainer__ = "Chi Xie"
4 |
5 | # An example for how to run this script:
6 | # CUDA_VISIBLE_DEVICES=0
7 | # python groundingdino.py \
8 | # -c ./groundingdino/config/GroundingDINO_SwinB.cfg.py \
9 | # -p ./ckpt/groundingdino_swinb_cogcoor.pth \
10 | # -o "outputs/gdino_d3" \
11 | # --box_threshold 0.05 \
12 | # --text_threshold 0.05 \
13 | # --img-top1
14 |
15 | import argparse
16 | import json
17 | import os
18 |
19 | import numpy as np
20 | import torch
21 | from PIL import Image, ImageDraw, ImageFont
22 | from pycocotools.coco import COCO
23 | from pycocotools.cocoeval import COCOeval
24 | from tqdm import tqdm
25 |
26 | import groundingdino.datasets.transforms as T
27 | from groundingdino.models import build_model
28 | from groundingdino.util.slconfig import SLConfig
29 | from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
30 | from d_cube import D3
31 |
32 |
33 | def plot_boxes_to_image(image_pil, tgt):
34 | H, W = tgt["size"]
35 | boxes = tgt["boxes"]
36 | labels = tgt["labels"]
37 | assert len(boxes) == len(labels), "boxes and labels must have same length"
38 |
39 | draw = ImageDraw.Draw(image_pil)
40 | mask = Image.new("L", image_pil.size, 0)
41 | mask_draw = ImageDraw.Draw(mask)
42 |
43 | # draw boxes and masks
44 | for box, label in zip(boxes, labels):
45 | # from 0..1 to 0..W, 0..H
46 | box = box * torch.Tensor([W, H, W, H])
47 | # from xywh to xyxy
48 | box[:2] -= box[2:] / 2
49 | box[2:] += box[:2]
50 | # random color
51 | color = tuple(np.random.randint(0, 255, size=3).tolist())
52 | # draw
53 | x0, y0, x1, y1 = box
54 | x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
55 |
56 | draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
57 | # draw.text((x0, y0), str(label), fill=color)
58 |
59 | font = ImageFont.load_default()
60 | if hasattr(font, "getbbox"):
61 | bbox = draw.textbbox((x0, y0), str(label), font)
62 | else:
63 | w, h = draw.textsize(str(label), font)
64 | bbox = (x0, y0, w + x0, y0 + h)
65 | # bbox = draw.textbbox((x0, y0), str(label))
66 | draw.rectangle(bbox, fill=color)
67 | draw.text((x0, y0), str(label), fill="white")
68 |
69 | mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
70 | return image_pil, mask
71 |
72 |
73 | def load_image(image_path):
74 | # load image
75 | image_pil = Image.open(image_path).convert("RGB") # load image
76 |
77 | transform = T.Compose(
78 | [
79 | T.RandomResize([800], max_size=1333),
80 | T.ToTensor(),
81 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
82 | ]
83 | )
84 | image, _ = transform(image_pil, None) # 3, h, w
85 | return image_pil, image
86 |
87 |
88 | def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
89 | args = SLConfig.fromfile(model_config_path)
90 | args.device = "cuda" if not cpu_only else "cpu"
91 | model = build_model(args)
92 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
93 | load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
94 | print(load_res)
95 | _ = model.eval()
96 | return model
97 |
98 |
99 | def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, cpu_only=False):
100 | caption = caption.lower()
101 | caption = caption.strip()
102 | if not caption.endswith("."):
103 | caption = caption + "."
104 | device = "cuda" if not cpu_only else "cpu"
105 | model = model.to(device)
106 | image = image.to(device)
107 | with torch.no_grad():
108 | outputs = model(image[None], captions=[caption])
109 | logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
110 | boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
111 | logits.shape[0]
112 |
113 | # filter output
114 | logits_filt = logits.clone()
115 | boxes_filt = boxes.clone()
116 | filt_mask = logits_filt.max(dim=1)[0] > box_threshold
117 | logits_filt = logits_filt[filt_mask] # num_filt, 256
118 | boxes_filt = boxes_filt[filt_mask] # num_filt, 4
119 | logits_filt.shape[0]
120 |
121 | # get phrase
122 | tokenlizer = model.tokenizer
123 | tokenized = tokenlizer(caption)
124 | # build pred
125 | pred_phrases = []
126 | logits_list = []
127 | for logit, box in zip(logits_filt, boxes_filt):
128 | pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
129 | logits_list.append(logit.max().item())
130 | if with_logits:
131 | pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
132 | else:
133 | pred_phrases.append(pred_phrase)
134 |
135 | return boxes_filt, pred_phrases, logits_list
136 |
137 |
138 | def get_dataset_iter(coco):
139 | img_ids = coco.get_img_ids()
140 | for img_id in img_ids:
141 | img_info = coco.load_imgs(img_id)[0]
142 | file_name = img_info["file_name"]
143 | img_path = os.path.join(IMG_ROOT, file_name)
144 | yield img_id, img_path
145 |
146 |
147 | def eval_on_d3(pred_path, mode="pn"):
148 | assert mode in ("pn", "p", "n")
149 | if mode == "pn":
150 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json")
151 | elif mode == "p":
152 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json")
153 | else:
154 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json")
155 | coco = COCO(gt_path)
156 | d3_res = coco.loadRes(pred_path)
157 | cocoEval = COCOeval(coco, d3_res, "bbox")
158 | cocoEval.evaluate()
159 | cocoEval.accumulate()
160 | cocoEval.summarize()
161 |
162 | # comment the following if u only need intra/inter map for full/pres/abs
163 | # ===================== uncomment this if u need detailed analysis =====================
164 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1]
165 | # category_ids = coco.getCatIds()
166 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)]
167 |
168 | # aps_lens = defaultdict(list)
169 | # counter_lens = defaultdict(int)
170 | # for i in range(len(category_names)):
171 | # ap = aps[:, :, i]
172 | # ap_value = ap[ap > -1].mean()
173 | # if not np.isnan(ap_value):
174 | # len_ref = len(category_names[i].split(" "))
175 | # aps_lens[len_ref].append(ap_value)
176 | # counter_lens[len_ref] += 1
177 |
178 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)])
179 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)])
180 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)])
181 | # ap_sum_very_long = sum(
182 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)]
183 | # )
184 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)])
185 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)])
186 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)])
187 | # c_sum_very_long = sum(
188 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)]
189 | # )
190 | # map_short = ap_sum_short / c_sum_short
191 | # map_mid = ap_sum_mid / c_sum_mid
192 | # map_long = ap_sum_long / c_sum_long
193 | # map_very_long = ap_sum_very_long / c_sum_very_long
194 | # print(
195 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}"
196 | # )
197 | # ===================== uncomment this if u need detailed analysis =====================
198 |
199 |
200 | def inference_on_d3(data_iter, model, args, box_threshold, text_threshold):
201 | pred = []
202 | for idx, (img_id, image_path) in enumerate(tqdm(data_iter)):
203 | # load image
204 | image_pil, image = load_image(image_path)
205 | size = image_pil.size
206 | W, H = size
207 |
208 | group_ids = d3.get_group_ids(img_ids=[img_id])
209 | sent_ids = d3.get_sent_ids(group_ids=group_ids)
210 | sent_list = d3.load_sents(sent_ids=sent_ids)
211 | text_list = [sent['raw_sent'] for sent in sent_list]
212 |
213 | for sent_id, text_prompt in zip(sent_ids, text_list):
214 | # run model
215 | boxes_filt, pred_phrases, logit_list = get_grounding_output(
216 | model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, with_logits=False,
217 | )
218 | if args.vis:
219 | pred_dict = {
220 | "boxes": boxes_filt, # [x_center, y_center, w, h]
221 | "size": [size[1], size[0]],
222 | "labels": [f"{phrase}({str(logit)[:4]})" for phrase, logit in zip(pred_phrases, logit_list)],
223 | }
224 | image_with_box = plot_boxes_to_image(image_pil.copy(), pred_dict)[0]
225 | image_with_box.save(os.path.join(output_dir, f"{img_id}_{text_prompt}.jpg"))
226 | if not logit_list:
227 | continue
228 | if args.img_top1:
229 | max_score_idx = logit_list.index(max(logit_list))
230 | bboxes, phrases, logits = [boxes_filt[max_score_idx]], [pred_phrases[max_score_idx]], [logit_list[max_score_idx]]
231 | else:
232 | bboxes, phrases, logits = boxes_filt, pred_phrases, logit_list
233 | for box, phrase, logit in zip(bboxes, phrases, logits):
234 | if len(phrase) > args.overlap_percent * len(text_prompt) or phrase == text_prompt:
235 | x1, y1, w, h = box.tolist()
236 | x0, y0 = x1 - w / 2, y1 - h / 2
237 | pred_item = {
238 | "image_id": img_id,
239 | "category_id": sent_id,
240 | "bbox": [x0 * W, y0 * H, w * W, h * H],
241 | "score": float(logit),
242 | }
243 | pred.append(pred_item)
244 |
245 | return pred
246 |
247 |
248 | if __name__ == "__main__":
249 | IMG_ROOT = None # set here
250 | JSON_ANNO_PATH = None # set here
251 | PKL_ANNO_PATH = None # set here
252 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first"
253 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first"
254 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first"
255 |
256 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
257 |
258 | parser = argparse.ArgumentParser("Grounding DINO evaluation on D-cube (https://arxiv.org/abs/2307.12813)", add_help=True)
259 | parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
260 | parser.add_argument(
261 | "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
262 | )
263 | # parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
264 | # parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
265 | parser.add_argument(
266 | "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
267 | )
268 | parser.add_argument("--vis", action="store_true", help="visualization on D3")
269 |
270 | parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
271 | parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
272 |
273 | parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
274 | parser.add_argument("--img-top1", action="store_true", help="select only the box with top max score")
275 | # parser.add_argument("--overlap-percent", type=float, default=1.0, help="overlapping percentage between input prompt and output label")
276 | # this overlapping percentage denotes an additional post-processing technique we designed. if you turn this on, you may get higher performance by tuning this parameter.
277 | args = parser.parse_args()
278 | args.overlap_percent = 1 # by default, we do not use this technique.
279 | print(args)
280 |
281 | # cfg
282 | config_file = args.config_file # change the path of the model config file
283 | checkpoint_path = args.checkpoint_path # change the path of the model
284 | # image_path = args.image_path
285 | # text_prompt = args.text_prompt
286 | output_dir = args.output_dir
287 | box_threshold = args.box_threshold
288 | text_threshold = args.text_threshold
289 |
290 | # make dir
291 | os.makedirs(output_dir, exist_ok=True)
292 | # load model
293 | model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
294 |
295 | data_iter = get_dataset_iter(d3)
296 |
297 | pred = inference_on_d3(data_iter, model, args, box_threshold=box_threshold, text_threshold=text_threshold)
298 |
299 | pred_path = os.path.join(output_dir, f"prediction.json")
300 | with open(pred_path, "w") as f_:
301 | json.dump(pred, f_)
302 | eval_on_d3(pred_path, mode='pn')
303 | eval_on_d3(pred_path, mode='p')
304 | eval_on_d3(pred_path, mode='n')
305 |
--------------------------------------------------------------------------------
/eval_sota/owl_vit.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from collections import defaultdict
4 |
5 | from tqdm import tqdm
6 | from PIL import Image
7 | import numpy as np
8 | from pycocotools.coco import COCO
9 | from pycocotools.cocoeval import COCOeval
10 | import torch
11 | from transformers import OwlViTProcessor, OwlViTForObjectDetection
12 |
13 | from d_cube import D3
14 |
15 |
16 | def write_json(json_path, json_data):
17 | with open(json_path, "w") as f_:
18 | json.dump(json_data, f_)
19 |
20 |
21 | def read_json(json_path):
22 | with open(json_path, "r") as f_:
23 | json_data = json.load(f_)
24 | return json_data
25 |
26 |
27 | def load_image_general(image_path):
28 | image_pil = Image.open(image_path)
29 | return image_pil
30 |
31 |
32 | def get_prediction(model, image, captions, cpu_only=False):
33 | for i in range(len(captions)):
34 | captions[i] = captions[i].lower()
35 | captions[i] = captions[i].strip()
36 | if not captions[i].endswith("."):
37 | captions[i] = captions[i] + "."
38 | device = "cuda" if not cpu_only else "cpu"
39 | model = model.to(device)
40 | with torch.no_grad():
41 | inputs = processor(text=[captions], images=image, return_tensors="pt").to(
42 | device
43 | )
44 | outputs = model(**inputs)
45 | target_size = torch.Tensor([image.size[::-1]]).to(device)
46 | results = processor.post_process_object_detection(
47 | outputs=outputs, target_sizes=target_size, threshold=0.1
48 | # the post precessing threshold will affect the performance obviously
49 | # you may tune it to get better performance, e.g., 0.05
50 | )
51 | boxes, scores, labels = (
52 | results[0]["boxes"],
53 | results[0]["scores"],
54 | results[0]["labels"],
55 | )
56 | return boxes, scores, labels
57 |
58 |
59 | def get_dataset_iter(coco):
60 | img_ids = coco.get_img_ids()
61 | for img_id in img_ids:
62 | img_info = coco.load_imgs(img_id)[0]
63 | file_name = img_info["file_name"]
64 | img_path = os.path.join(IMG_ROOT, file_name)
65 | yield img_id, img_path
66 |
67 |
68 | def eval_on_d3(pred_path, mode="pn"):
69 | assert mode in ("pn", "p", "n")
70 | if mode == "pn":
71 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json")
72 | elif mode == "p":
73 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json")
74 | else:
75 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json")
76 | coco = COCO(gt_path)
77 | d3_res = coco.loadRes(pred_path)
78 | cocoEval = COCOeval(coco, d3_res, "bbox")
79 | cocoEval.evaluate()
80 | cocoEval.accumulate()
81 | cocoEval.summarize()
82 |
83 | # comment the following if u only need intra/inter map for full/pres/abs
84 | # ===================== uncomment this if u need detailed analysis =====================
85 | # aps = cocoEval.eval["precision"][:, :, :, 0, -1]
86 | # category_ids = coco.getCatIds()
87 | # category_names = [cat["name"] for cat in coco.loadCats(category_ids)]
88 |
89 | # aps_lens = defaultdict(list)
90 | # counter_lens = defaultdict(int)
91 | # for i in range(len(category_names)):
92 | # ap = aps[:, :, i]
93 | # ap_value = ap[ap > -1].mean()
94 | # if not np.isnan(ap_value):
95 | # len_ref = len(category_names[i].split(" "))
96 | # aps_lens[len_ref].append(ap_value)
97 | # counter_lens[len_ref] += 1
98 |
99 | # ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)])
100 | # ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)])
101 | # ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)])
102 | # ap_sum_very_long = sum(
103 | # [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)]
104 | # )
105 | # c_sum_short = sum([counter_lens[i] for i in range(1, 4)])
106 | # c_sum_mid = sum([counter_lens[i] for i in range(4, 7)])
107 | # c_sum_long = sum([counter_lens[i] for i in range(7, 10)])
108 | # c_sum_very_long = sum(
109 | # [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)]
110 | # )
111 | # map_short = ap_sum_short / c_sum_short
112 | # map_mid = ap_sum_mid / c_sum_mid
113 | # map_long = ap_sum_long / c_sum_long
114 | # map_very_long = ap_sum_very_long / c_sum_very_long
115 | # print(
116 | # f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}"
117 | # )
118 | # ===================== uncomment this if u need detailed analysis =====================
119 |
120 |
121 | def inference_on_d3(data_iter, model):
122 | pred = []
123 | error = []
124 | for img_id, image_path in tqdm(data_iter):
125 | image = load_image_general(image_path)
126 |
127 | # ==================================== intra-group setting ====================================
128 | # each image is evaluated with the categories in its group (usually 4)
129 | group_ids = d3.get_group_ids(img_ids=[img_id])
130 | sent_ids = d3.get_sent_ids(group_ids=group_ids)
131 | # ==================================== intra-group setting ====================================
132 | # ==================================== inter-group setting ====================================
133 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset)
134 | # sent_ids = d3.get_sent_ids()
135 | # ==================================== inter-group setting ====================================
136 | sent_list = d3.load_sents(sent_ids=sent_ids)
137 | text_list = [sent["raw_sent"] for sent in sent_list]
138 |
139 | try:
140 | boxes, scores, labels = get_prediction(model, image, text_list, cpu_only=False)
141 | for box, score, label in zip(boxes, scores, labels):
142 | pred_item = {
143 | "image_id": img_id,
144 | "category_id": sent_ids[label],
145 | "bbox": convert_to_xywh(box.tolist()), # use xywh
146 | "score": float(score),
147 | }
148 | pred.append(pred_item) # the output to be saved to JSON.
149 |         except Exception as exc:  # keep going, but record the failure so it is saved to error.json
150 |             error.append({"image_id": img_id, "error": str(exc)})
151 | return pred, error
152 |
153 |
154 | def convert_to_xywh(bbox_xyxy):
155 | """
156 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format.
157 | """
158 | x1, y1, x2, y2 = bbox_xyxy
159 | width = x2 - x1
160 | height = y2 - y1
161 | return [x1, y1, width, height]
162 |
163 |
164 | if __name__ == "__main__":
165 | IMG_ROOT = None # set here
166 | JSON_ANNO_PATH = None # set here
167 | PKL_ANNO_PATH = None # set here
168 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first"
169 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first"
170 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first"
171 |
172 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
173 |
174 | output_dir = "ovd/owlvit/"
175 | os.makedirs(output_dir, exist_ok=True)
176 |
177 | # model prediction
178 |     processor = OwlViTProcessor.from_pretrained("owl-vit")  # local checkpoint path or a HF model id, e.g. "google/owlvit-large-patch14"
179 |     model = OwlViTForObjectDetection.from_pretrained("owl-vit")
180 | data_iter = get_dataset_iter(d3)
181 | pred, error = inference_on_d3(data_iter, model)
182 |
183 | pred_path = os.path.join(output_dir, f"prediction.json")
184 | pred_path_error = os.path.join(output_dir, "error.json")
185 | write_json(pred_path, pred)
186 | write_json(pred_path_error, error)
187 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format
188 | # the output format is identical to COCO.
189 |
190 | eval_on_d3(pred_path, mode="pn") # the FULL setting
191 | eval_on_d3(pred_path, mode="p") # the PRES setting
192 | eval_on_d3(pred_path, mode="n") # the ABS setting
193 |
--------------------------------------------------------------------------------
/eval_sota/sphinx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Jie Li"
3 | __maintainer__ = "Chi Xie"
4 |
5 | import json
6 | import os
7 | from collections import defaultdict
8 | import re
9 |
10 | from PIL import Image
11 | from pycocotools.coco import COCO
12 | from pycocotools.cocoeval import COCOeval
13 |
14 | from d_cube import D3
15 |
16 |
17 | def write_json(json_path, json_data):
18 | with open(json_path, "w") as f_:
19 | json.dump(json_data, f_)
20 |
21 |
22 | def read_json(json_path):
23 | with open(json_path, "r") as f_:
24 | json_data = json.load(f_)
25 | return json_data
26 |
27 |
28 | def load_image_general(image_path):
29 | image_pil = Image.open(image_path)
30 | return image_pil
31 |
32 |
33 | def extract_boxes(input_string):
34 | # if input_string.startswith("None"):
35 | # return []
36 | # Define the pattern using regular expression
37 | pattern = r'\[([\d.,; ]+)\]'
38 |
39 | # Search for the pattern in the input string
40 | match = re.search(pattern, input_string)
41 |
42 | # If a match is found, extract and return the boxes as a list
43 | if match:
44 | boxes_str = match.group(1)
45 | boxes_list = [list(map(float, box.split(','))) for box in boxes_str.split(';')]
46 | return boxes_list
47 | else:
48 | return []
49 |
50 |
51 | def get_prediction(mllm_res, image, captions, cpu_only=False):
52 | boxes, scores, labels = [], [], []
53 | width, height = image.size
54 | for idx, res_item in enumerate(mllm_res):
55 | boxes_list = extract_boxes(res_item["answer"])
56 | for bbox in boxes_list:
57 | bbox_rescaled = get_true_bbox(image.size, bbox)
58 | boxes.append(bbox_rescaled)
59 | scores.append(1.0)
60 | labels.append(idx)
61 | return boxes, scores, labels
62 |
63 |
64 | def get_dataset_iter(coco):
65 | img_ids = coco.get_img_ids()
66 | for img_id in img_ids:
67 | img_info = coco.load_imgs(img_id)[0]
68 | file_name = img_info["file_name"]
69 | img_path = os.path.join(IMG_ROOT, file_name)
70 | yield img_id, file_name, img_path
71 |
72 |
73 | def eval_on_d3(pred_path, mode="pn"):
74 | assert mode in ("pn", "p", "n")
75 | if mode == "pn":
76 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json")
77 | elif mode == "p":
78 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json")
79 | else:
80 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json")
81 | coco = COCO(gt_path)
82 | d3_res = coco.loadRes(pred_path)
83 | cocoEval = COCOeval(coco, d3_res, "bbox")
84 | cocoEval.evaluate()
85 | cocoEval.accumulate()
86 | cocoEval.summarize()
87 |
88 |
89 | def group_sphinx_res_by_img(inference_res):
90 | inference_res_by_img = defaultdict(list)
91 | for res_item in inference_res:
92 | img_path = "/".join(res_item["image_path"].split("/")[-2:])
93 | inference_res_by_img[img_path].append(res_item)
94 | inference_res_by_img = dict(inference_res_by_img)
95 | return inference_res_by_img
96 |
97 |
98 | def get_true_bbox(img_size, bbox):
99 | width, height = img_size
100 | max_edge = max(height, width)
101 | bbox = [v * max_edge for v in bbox]
102 | diff = abs(width - height) // 2
103 | if height < width:
104 | bbox[1] -= diff
105 | bbox[3] -= diff
106 | else:
107 | bbox[0] -= diff
108 | bbox[2] -= diff
109 | return bbox
110 |
111 |
112 | def inference_on_d3(data_iter, inference_res):
113 | pred = []
114 | inf_res_by_img = group_sphinx_res_by_img(inference_res)
115 | for idx, (img_id, img_name, img_path) in enumerate(data_iter):
116 | image = load_image_general(img_path)
117 |
118 | # ==================================== intra-group setting ====================================
119 | # each image is evaluated with the categories in its group (usually 4)
120 | group_ids = d3.get_group_ids(img_ids=[img_id])
121 | sent_ids = d3.get_sent_ids(group_ids=group_ids)
122 | # ==================================== intra-group setting ====================================
123 | # ==================================== inter-group setting ====================================
124 | # each image is evaluated with all categories in the dataset (422 for the first version of the dataset)
125 | # sent_ids = d3.get_sent_ids()
126 | # ==================================== inter-group setting ====================================
127 | sent_list = d3.load_sents(sent_ids=sent_ids)
128 | text_list = [sent["raw_sent"] for sent in sent_list]
129 |
130 | boxes, scores, labels = get_prediction(inf_res_by_img[img_name], image, text_list, cpu_only=False)
131 | for box, score, label in zip(boxes, scores, labels):
132 | pred_item = {
133 | "image_id": img_id,
134 | "category_id": sent_ids[label],
135 | "bbox": convert_to_xywh(box), # use xywh
136 | "score": float(score),
137 | }
138 | pred.append(pred_item) # the output to be saved to JSON.
139 | return pred
140 |
141 |
142 | def convert_to_xywh(bbox_xyxy):
143 | """
144 | Convert top-left and bottom-right corner coordinates to [x, y, width, height] format.
145 | """
146 | x1, y1, x2, y2 = bbox_xyxy
147 | width = x2 - x1
148 | height = y2 - y1
149 | return [x1, y1, width, height]
150 |
151 |
152 | if __name__ == "__main__":
153 | IMG_ROOT = None # set here
154 | JSON_ANNO_PATH = None # set here
155 | PKL_ANNO_PATH = None # set here
156 | # ============================== SPHINX inference result file ===============
157 | SPHINX_INFERENCE_RES_PATH = None
158 | # You can download the SPHINX d3 inference result example from:
159 | # https://github.com/shikras/d-cube/files/14276682/sphinx_d3_result.json
160 | # For the inference process, please refer to SPHINX official repo (https://github.com/Alpha-VLLM/LLaMA2-Accessory)
161 | # the prompts we used are available in this JSON file
162 |     # Thanks to Jie Li (https://github.com/theFool32) for contributing this
163 | # ============================== SPHINX inference result file ===============
164 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first"
165 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first"
166 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first"
167 |     assert SPHINX_INFERENCE_RES_PATH is not None, "Please set SPHINX_INFERENCE_RES_PATH in the script first"
168 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
169 |
170 |     output_dir = "mllm/sphinx/"  # or whatever you prefer
171 |     os.makedirs(output_dir, exist_ok=True)  # make sure the output dir exists before writing
172 |     inference_res = read_json(SPHINX_INFERENCE_RES_PATH)
173 | # model prediction
174 | data_iter = get_dataset_iter(d3)
175 | pred = inference_on_d3(data_iter, inference_res)
176 |
177 |     pred_path = os.path.join(output_dir, "prediction.json")
178 | write_json(pred_path, pred)
179 | # see https://github.com/shikras/d-cube/blob/main/doc.md#output-format for the output format
180 | # the output format is identical to COCO.
181 |
182 | eval_on_d3(pred_path, mode="pn") # the FULL setting
183 | eval_on_d3(pred_path, mode="p") # the PRES setting
184 | eval_on_d3(pred_path, mode="n") # the ABS setting
185 |
--------------------------------------------------------------------------------
/qa.md:
--------------------------------------------------------------------------------
1 | # Frequently Asked Questions
2 |
3 | Q:
4 | What's the difference between the Intra-Group and Inter-Group settings in [the DOD paper](https://arxiv.org/abs/2307.12813), and how do I set them?
5 |
6 | A:
7 | Please see [this explanation in the document](./doc.md#intra--or-inter-group-settings).
8 |
9 |
10 |
11 | Q:
12 | What's the meaning of and difference between FULL, PRES, and ABS?
13 |
14 | A:
15 | Please see [this explanation in the document](./doc.md#full-pres-and-abs).
16 |
17 |
18 |
19 | Q:
20 | How do I visualize the ground truth or predictions on an image?
21 |
22 | A:
23 | You can use the `d3.get_anno_ids` function and pass the `img_id` you choose as a parameter to get the annotation ids for an image.
24 | After this, you can obtain the annotation details (class ids, bboxes) with `d3.load_annos` and draw them on the image, as sketched below.
25 |
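26 | A minimal sketch of this workflow (not the official API reference): it assumes a `D3` instance built from your own `IMG_ROOT` and `PKL_ANNO_PATH`, that `get_anno_ids`/`load_annos` take keyword arguments in the same style as `get_sent_ids`/`load_sents`, and that the annotation `bbox` is in `[x, y, w, h]` format; adjust these details to match `doc.md` if they differ.
27 | 
28 | ```python
29 | import os
30 | import cv2
31 | from d_cube import D3
32 | 
33 | IMG_ROOT = "path/to/d3_images"    # set to your image folder
34 | PKL_ANNO_PATH = "path/to/d3_pkl"  # set to your pkl annotation folder
35 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
36 | 
37 | img_id = d3.get_img_ids()[0]  # pick any image id you like
38 | img_info = d3.load_imgs(img_id)[0]
39 | image = cv2.imread(os.path.join(IMG_ROOT, img_info["file_name"]))
40 | 
41 | anno_ids = d3.get_anno_ids(img_ids=[img_id])  # annotation ids on this image
42 | for anno in d3.load_annos(anno_ids=anno_ids):
43 |     x, y, w, h = map(int, anno["bbox"])  # assumed xywh; convert first if it is xyxy
44 |     cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
45 | cv2.imwrite("vis_example.jpg", image)
46 | ```
47 | 
48 | The same loop works for predictions: replace the loaded annotations with your own boxes, making sure they are in the same coordinate format before drawing.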
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pycocotools
3 | opencv-python
4 | matplotlib
5 |
--------------------------------------------------------------------------------
/scripts/eval_and_analysis_json.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Zhao Zhang"
3 | __maintainer__ = "Chi Xie"
4 | # this script takes the result JSON in, and prints evaluation and analysis results on D-cube (FULL/PRES/ABS, etc.)
5 | import os
6 | import json
7 | import argparse
8 | from collections import defaultdict
9 |
10 | import numpy as np
11 | from pycocotools.coco import COCO
12 | from pycocotools.cocoeval import COCOeval
13 |
14 | from d_cube import D3
15 |
16 | def eval_on_d3(pred_path, mode="pn", nbox_partition=None, lref_partition=False):
17 | assert mode in ("pn", "p", "n")
18 | if mode == "pn":
19 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_full_annotations.json")
20 | elif mode == "p":
21 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_pres_annotations.json")
22 | else:
23 | gt_path = os.path.join(JSON_ANNO_PATH, "d3_abs_annotations.json")
24 |
25 | if nbox_partition:
26 | gt_path, pred_path = nbox_partition_json(gt_path, pred_path, nbox_partition)
27 |
28 | # Eval results
29 | coco = COCO(gt_path)
30 | d3_res = coco.loadRes(pred_path)
31 | cocoEval = COCOeval(coco, d3_res, "bbox")
32 | cocoEval.evaluate()
33 | cocoEval.accumulate()
34 | cocoEval.summarize()
35 |
36 | aps = cocoEval.eval["precision"][:, :, :, 0, -1]
37 | category_ids = coco.getCatIds()
38 | category_names = [cat["name"] for cat in coco.loadCats(category_ids)]
39 |
40 | if lref_partition:
41 | aps_lens = defaultdict(list)
42 | counter_lens = defaultdict(int)
43 | for i in range(len(category_names)):
44 | ap = aps[:, :, i]
45 | ap_value = ap[ap > -1].mean()
46 | if not np.isnan(ap_value):
47 | len_ref = len(category_names[i].split(" "))
48 | aps_lens[len_ref].append(ap_value)
49 | counter_lens[len_ref] += 1
50 |
51 |         ap_sum_short = sum([sum(aps_lens[i]) for i in range(1, 4)])
52 | ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)])
53 | ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)])
54 | ap_sum_very_long = sum(
55 | [sum(aps_lens[i]) for i in range(10, max(counter_lens.keys()) + 1)]
56 | )
57 | c_sum_short = sum([counter_lens[i] for i in range(1, 4)])
58 | c_sum_mid = sum([counter_lens[i] for i in range(4, 7)])
59 | c_sum_long = sum([counter_lens[i] for i in range(7, 10)])
60 | c_sum_very_long = sum(
61 | [counter_lens[i] for i in range(10, max(counter_lens.keys()) + 1)]
62 | )
63 | map_short = ap_sum_short / c_sum_short
64 | map_mid = ap_sum_mid / c_sum_mid
65 | map_long = ap_sum_long / c_sum_long
66 | map_very_long = ap_sum_very_long / c_sum_very_long
67 | print(
68 | f"mAP over reference length: short - {map_short:.4f}, mid - {map_mid:.4f}, long - {map_long:.4f}, very long - {map_very_long:.4f}"
69 | )
70 |
71 |
72 | def nbox_partition_json(gt_path, pred_path, nbox_partition):
73 | with open(gt_path, "r") as f_gt:
74 | gts = json.load(f_gt)
75 | with open(pred_path, "r") as f_pred:
76 | preds = json.load(f_pred)
77 |
78 | cat_obj_count = d3.bbox_num_analyze()
79 | annos = gts["annotations"]
80 | new_annos = []
81 | for ann in annos:
82 | img_id = ann["image_id"]
83 | category_id = ann["category_id"]
84 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1:
85 | new_annos.append(ann)
86 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1:
87 | new_annos.append(ann)
88 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2:
89 | new_annos.append(ann)
90 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3:
91 | new_annos.append(ann)
92 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4:
93 | new_annos.append(ann)
94 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4:
95 | new_annos.append(ann)
96 | gts["annotations"] = new_annos
97 | new_gts = gts
98 | new_preds = []
99 | for prd in preds:
100 | img_id = prd["image_id"]
101 | category_id = prd["category_id"]
102 | if nbox_partition == "no" and cat_obj_count[category_id - 1, img_id] == 0:
103 | new_preds.append(prd)
104 | if nbox_partition == "one" and cat_obj_count[category_id - 1, img_id] == 1:
105 | new_preds.append(prd)
106 | if nbox_partition == "multi" and cat_obj_count[category_id - 1, img_id] > 1:
107 | new_preds.append(prd)
108 | if nbox_partition == "two" and cat_obj_count[category_id - 1, img_id] == 2:
109 | new_preds.append(prd)
110 | if nbox_partition == "three" and cat_obj_count[category_id - 1, img_id] == 3:
111 | new_preds.append(prd)
112 | if nbox_partition == "four" and cat_obj_count[category_id - 1, img_id] == 4:
113 | new_preds.append(prd)
114 | if nbox_partition == "four_more" and cat_obj_count[category_id - 1, img_id] > 4:
115 | new_preds.append(prd)
116 |
117 | new_gt_path = gt_path.replace(".json", f".{nbox_partition}-instance.json")
118 | new_pred_path = pred_path.replace(".json", f".{nbox_partition}-instance.json")
119 | with open(new_gt_path, "w") as fo_gt:
120 | json.dump(new_gts, fo_gt)
121 | with open(new_pred_path, "w") as fo_pred:
122 | json.dump(new_preds, fo_pred)
123 | return new_gt_path, new_pred_path
124 |
125 |
126 | def convert_to_xywh(x1, y1, x2, y2):
127 | """
128 | Convert top-left and bottom-right corner coordinates to [x,y,width,height] format.
129 | """
130 | width = x2 - x1
131 | height = y2 - y1
132 | return x1, y1, width, height
133 |
134 |
135 | def transform_json_boxes(pred_path):
136 | with open(pred_path, "r") as f_:
137 | res = json.load(f_)
138 | for item in res:
139 | item["bbox"] = convert_to_xywh(*item["bbox"])
140 | res_path = pred_path.replace(".json", ".xywh.json")
141 | with open(res_path, "w") as f_w:
142 | json.dump(res, f_w)
143 | return res_path
144 |
145 |
146 | if __name__ == "__main__":
147 | IMG_ROOT = None # set here
148 | JSON_ANNO_PATH = None # set here
149 | PKL_ANNO_PATH = None # set here
150 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first"
151 | assert JSON_ANNO_PATH is not None, "Please set JSON_ANNO_PATH in the script first"
152 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first"
153 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
154 |
155 |     parser = argparse.ArgumentParser(
156 |         description="An example script for D-cube evaluation with a prediction file (JSON)",
157 |         add_help=True,
158 |     )
159 | parser.add_argument("pred_path", type=str, help="path to the prediction JSON file")
160 | parser.add_argument(
161 | "--partition-by-nbox",
162 | action="store_true",
163 | help="divide the images by num of boxes for each ref",
164 | )
165 | parser.add_argument(
166 | "--partition-by-lens",
167 | action="store_true",
168 | help="divide the references by their lengths",
169 | )
170 | parser.add_argument(
171 | "--xyxy2xywh",
172 | action="store_true",
173 | help="transform box coords from xyxy to xywh",
174 | )
175 | args = parser.parse_args()
176 | if args.xyxy2xywh:
177 | pred_path = transform_json_boxes(args.pred_path)
178 | else:
179 | pred_path = args.pred_path
180 | 
181 | if args.partition_by_nbox:
182 |         # partition: evaluate separately by the number of instances per reference
183 | for mode in ("pn", "p", "n"):
184 | # for ptt in ('no', 'one', 'multi'):
185 | for ptt in ("no", "one", "two", "three", "four", "four_more"):
186 | eval_on_d3(pred_path, mode=mode, nbox_partition=ptt)
187 | else:
188 | eval_on_d3(pred_path, mode="pn", lref_partition=args.partition_by_lens)
189 | eval_on_d3(pred_path, mode="p", lref_partition=args.partition_by_lens)
190 | eval_on_d3(pred_path, mode="n", lref_partition=args.partition_by_lens)
191 |
--------------------------------------------------------------------------------
/scripts/eval_json_example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Chi Xie and Zhao Zhang"
3 | __maintainer__ = "Chi Xie"
4 | # this script takes the result JSON in and prints evaluation results on D-cube (FULL, PRES, or ABS)
5 | from pycocotools.coco import COCO
6 | from pycocotools.cocoeval import COCOeval
7 |
8 | # Eval results with COCOAPI
9 | gt_path = "./d3_full_annotations.json" # FULL, PRES or ABS
10 | pred_path = None # set your prediction JSON path
11 | coco = COCO(gt_path)
12 | d3_res = coco.loadRes(pred_path)
13 | cocoEval = COCOeval(coco, d3_res, "bbox")
14 | cocoEval.evaluate()
15 | cocoEval.accumulate()
16 | cocoEval.summarize()
17 |
--------------------------------------------------------------------------------
/scripts/get_d3_stat.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from d_cube.vis_util import plot_hist
4 | from d_cube import D3
5 |
6 |
7 | def vis_num_instance(cat_obj_count):
8 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img]
9 |
10 | # Calculate the total number of instances in each image
11 | total_instances_per_image = np.sum(cat_obj_count, axis=0)
12 |
13 | # # Plot the histogram
14 | # plt.hist(total_instances_per_image, bins=20)
15 | # plt.xlabel('Number of Instances')
16 | # plt.ylabel('Frequency')
17 |     # plt.title('Distribution of Number of Instances on an Image')
18 |
19 | # # Save the figure
20 | # plt.savefig('vis_fig/instance_distribution.png', bbox_inches='tight')
21 | # plt.close()
22 | plot_hist(
23 | total_instances_per_image,
24 | bins=max(total_instances_per_image) - min(total_instances_per_image) + 1,
25 | save_path="vis_fig/instance_dist_hist.pdf",
26 | )
27 |
28 |
29 | def vis_num_category(cat_obj_count):
30 | # Assuming `cat_obj_count` is your numpy array of shape [n_cat, n_img]
31 |
32 | # Calculate the number of categories in each image
33 | num_categories_per_image = np.sum(cat_obj_count > 0, axis=0)
34 |
35 | # # Plot the histogram
36 | # plt.hist(num_categories_per_image, bins=20)
37 | # plt.xlabel('Number of Categories')
38 | # plt.ylabel('Frequency')
39 |     # plt.title('Distribution of Number of Categories on an Image')
40 |
41 | # # Save the figure
42 | # plt.savefig('vis_fig/category_distribution.png', bbox_inches='tight')
43 | # plt.close()
44 | plot_hist(
45 | num_categories_per_image,
46 | bins=max(num_categories_per_image) - min(num_categories_per_image) + 1,
47 | save_path="vis_fig/category_dist_hist.pdf",
48 | )
49 |
50 |
51 | def vis_num_img_per_cat(cat_obj_count):
52 | num_img_per_cat = np.sum(cat_obj_count > 0, axis=1)
53 | plot_hist(
54 | num_img_per_cat,
55 | bins=20,
56 | save_path="vis_fig/nimg_pcat_hist.pdf",
57 | x="Num. of images",
58 | )
59 |
60 |
61 | def vis_num_box_per_cat(cat_obj_count):
62 | num_box_per_cat = np.sum(cat_obj_count, axis=1)
63 | plot_hist(
64 | num_box_per_cat,
65 | bins=20,
66 | save_path="vis_fig/nbox_pcat_hist.pdf",
67 | x="Num. of instances",
68 | )
69 |
70 |
71 | def vis_num_box_per_cat_per_img(cat_obj_count):
72 | img_obj_count = cat_obj_count.reshape(-1)
73 | plot_hist(
74 | img_obj_count[img_obj_count > 0],
75 | bins=max(img_obj_count) - min(img_obj_count) + 1,
76 | save_path="vis_fig/nbox_pcat_pimg_hist.pdf",
77 |         x="Num. of instances on an image",
78 | )
79 |
80 |
81 | if __name__ == "__main__":
82 | IMG_ROOT = None # set here
83 | PKL_ANNO_PATH = None # set here
84 | assert IMG_ROOT is not None, "Please set IMG_ROOT in the script first"
85 | assert PKL_ANNO_PATH is not None, "Please set PKL_ANNO_PATH in the script first"
86 | d3 = D3(IMG_ROOT, PKL_ANNO_PATH)
87 |
88 | cat_obj_count = d3.bbox_num_analyze()
89 | vis_num_instance(cat_obj_count)
90 | vis_num_category(cat_obj_count)
91 | vis_num_img_per_cat(cat_obj_count)
92 | vis_num_box_per_cat(cat_obj_count)
93 | vis_num_box_per_cat_per_img(cat_obj_count)
94 |
95 | d3.stat_description(with_rev=False)
96 | d3.stat_description(with_rev=True)
97 | d3.stat_description(with_rev=False, inter_group=True)
98 | d3.stat_description(with_rev=True, inter_group=True)
99 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | setuptools.setup(
4 | name='ddd-dataset',
5 | version='0.1.2',
6 | author='Chi Xie',
7 | author_email='chixie.personal@gmail.com',
8 | description='Toolkit for Description Detection Dataset ($D^3$)',
9 | long_description='Toolkit for Description Detection Dataset ($D^3$): A detection dataset with class names characterized by intricate and flexible expressions, for the Described Object Detection (DOD) task.',
10 | long_description_content_type='text/markdown',
11 | license='CC BY-NC 4.0',
12 | packages=['d_cube'],
13 | package_dir={"d_cube": "d_cube"},
14 | url='https://github.com/shikras/d-cube',
15 | project_urls={
16 | "Bug Tracker": "https://github.com/shikras/d-cube/issues",
17 | },
18 | install_requires=['numpy', 'pycocotools', 'opencv-python', 'matplotlib'],
19 |
20 | classifiers=[
21 | 'Development Status :: 4 - Beta',
22 | 'Intended Audience :: Science/Research',
23 | 'Intended Audience :: Developers',
24 | 'Intended Audience :: Education',
25 | 'Operating System :: MacOS',
26 | 'Operating System :: Microsoft :: Windows',
27 | 'Operating System :: POSIX :: Linux',
28 | 'Programming Language :: Python :: 3',
29 | ],
30 | )
31 |
--------------------------------------------------------------------------------