├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── docs
│ ├── docs
│ │ ├── ade20k.md
│ │ ├── coco.md
│ │ ├── imagenet.md
│ │ ├── img
│ │ │ ├── ade20k.png
│ │ │ ├── banner.png
│ │ │ ├── coco.jpg
│ │ │ ├── connect.png
│ │ │ ├── connect2.png
│ │ │ ├── examples.png
│ │ │ ├── imagenet.jpeg
│ │ │ ├── language_model.png
│ │ │ ├── pascalvoc2012.png
│ │ │ ├── results.png
│ │ │ ├── sotabencheval.png
│ │ │ └── squad20.png
│ │ ├── index.md
│ │ ├── pascalvoc.md
│ │ ├── squad.md
│ │ ├── wikitext103.md
│ │ └── wmt.md
│ ├── mkdocs.yml
│ └── site
│   ├── img
│   │ └── squad20.png
│   ├── squad
│   │ └── index.html
│   └── wmt
│     └── index.html
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── sotabencheval
  ├── __init__.py
  ├── core
  │ ├── __init__.py
  │ ├── cache.py
  │ └── evaluator.py
  ├── image_classification
  │ ├── __init__.py
  │ ├── imagenet.py
  │ └── utils.py
  ├── language_modelling
  │ ├── __init__.py
  │ └── wikitext.py
  ├── machine_translation
  │ ├── __init__.py
  │ ├── languages.py
  │ ├── metrics.py
  │ └── wmt.py
  ├── natural_language_inference
  │ ├── __init__.py
  │ └── multinli.py
  ├── object_detection
  │ ├── __init__.py
  │ ├── coco.py
  │ ├── coco_eval.py
  │ └── utils.py
  ├── question_answering
  │ ├── __init__.py
  │ ├── evaluate_v11.py
  │ ├── evaluate_v20.py
  │ ├── squad.py
  │ └── utils.py
  ├── semantic_segmentation
  │ ├── __init__.py
  │ ├── ade20k.py
  │ ├── pascalvoc.py
  │ └── utils.py
  ├── utils.py
  └── version.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.egg-info
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help default docs build release clean test check fmt
2 | .DEFAULT_GOAL := help
3 | PROJECT := sotabench-eval
4 |
5 |
6 | help: ## Show help.
7 | @grep -E '^[a-zA-Z2_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
8 |
9 |
10 | docs: ## Build documentation.
11 | @cd docs && make html && open _build/html/index.html
12 |
13 |
14 | build: ## Build the source and wheel distribution packages.
15 | @python3 setup.py sdist bdist_wheel
16 |
17 |
18 | release: build ## Build and upload the package to PyPI.
19 | @twine upload --repository-url https://upload.pypi.org/legacy/ dist/*
20 | @rm -fr build dist sotabench-eval.egg-info
21 |
22 |
23 | clean: ## Cleanup the project
24 | @find . -type d -name __pycache__ -delete
25 | @find . -type f -name "*.py[cod]" -delete
26 | @rm -fr build dist sotabench-eval.egg-info
27 | @rm -fr docs/_build/*
28 |
29 |
30 | test: ## Run tests and code checks.
31 | @py.test -v --cov "$(PROJECT)" "$(PROJECT)"
32 |
33 |
34 | check: ## Run code checks.
35 | @flake8 "$(PROJECT)"
36 | @pydocstyle "$(PROJECT)"
37 |
38 |
39 | fmt: ## Format the code.
40 | @black --target-version=py37 --safe --line-length=79 "$(PROJECT)"
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |

2 |
3 | --------------------------------------------------------------------------------
4 |
5 | [](https://badge.fury.io/py/sotabencheval) [](https://paperswithcode.github.io/sotabench-eval/)
6 |
7 | `sotabencheval` is a framework-agnostic library that contains a collection of deep learning benchmarks you can use to benchmark your models. It can be used in conjunction with the [sotabench](https://www.sotabench.com) service to record results for models, so the community can compare model performance on different tasks. It also acts as a continuous integration style service for your repository, benchmarking your models on each commit.
8 |
9 | ## Benchmarks Supported
10 |
11 | - [ADE20K](https://paperswithcode.github.io/sotabench-eval/ade20k/) (Semantic Segmentation)
12 | - [COCO](https://paperswithcode.github.io/sotabench-eval/coco/) (Object Detection)
13 | - [ImageNet](https://paperswithcode.github.io/sotabench-eval/imagenet/) (Image Classification)
14 | - [SQuAD](https://paperswithcode.github.io/sotabench-eval/squad/) (Question Answering)
15 | - [WikiText-103](https://paperswithcode.github.io/sotabench-eval/wikitext103/) (Language Modelling)
16 | - [WMT](https://paperswithcode.github.io/sotabench-eval/wmt/) (Machine Translation)
17 |
18 | PRs welcome for further benchmarks!
19 |
20 | ## Installation
21 |
22 | Requires Python 3.6+.
23 |
24 | ```bash
25 | pip install sotabencheval
26 | ```
27 |
28 | ## Get Benching! 🏋️
29 |
30 | You should read the [full documentation here](https://paperswithcode.github.io/sotabench-eval/index.html), which contains guidance on getting started and connecting to [sotabench](https://www.sotabench.com).
31 |
32 | Integration is lightweight. For example, if you are evaluating an ImageNet model, you initialize an Evaluator object and (optionally) link it to the paper the model comes from:
33 |
34 | ```python
35 | from sotabencheval.image_classification import ImageNetEvaluator
36 | evaluator = ImageNetEvaluator(
37 | model_name='FixResNeXt-101 32x48d',
38 | paper_arxiv_id='1906.06423')
39 | ```
40 |
41 | Then for each batch of predictions your model makes on ImageNet, pass a dictionary with image IDs as keys and `np.ndarray`s of logits as values to the `evaluator.add` method:
42 |
43 | ```python
44 | evaluator.add(output_dict=dict(zip(image_ids, batch_output)))
45 | ```
46 |
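Once all predictions have been added, call the evaluator's `save` method so the results are recorded when the script runs on the sotabench server (see the documentation for the full workflow):

```python
evaluator.save()
```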
47 | The evaluation logic just needs to be written in a `sotabench.py` file and sotabench will run it on each commit and record the results:
48 |
49 |
50 |
51 | ## Contributing
52 |
53 | All contributions welcome!
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/docs/docs/ade20k.md:
--------------------------------------------------------------------------------
1 | # ADE20K
2 |
3 | 
4 |
5 | You can view the ADE20K leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ADE20K dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server ADE20K data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The ADE20K data is located in the root of your repository on the server at `.data/vision/ade20k`. This folder contains:
30 |
31 | - `ADEChallengeData2016.zip` - containing validation images and annotations
32 |
33 | Your local ADE20K files may have a different file directory structure, so you
34 | can use control flow like below to change the data path if the script is being
35 | run on sotabench servers:
36 |
37 | ``` python
38 | from sotabencheval.utils import is_server
39 |
40 | if is_server():
41 | DATA_ROOT = './.data/vision/ade20k'
42 | else: # local settings
43 | DATA_ROOT = '/home/ubuntu/my_data/'
44 | ```
45 |
46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
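Note that the server provides `ADEChallengeData2016.zip` as an archive rather than an extracted folder, so your script may need to unpack it before loading the data. A minimal sketch using the standard library (the extraction directory is an assumption; point it wherever your data-loading code expects the files):

``` python
import os
import zipfile

archive_path = os.path.join(DATA_ROOT, 'ADEChallengeData2016.zip')
extract_dir = os.path.join(DATA_ROOT, 'extracted')  # hypothetical target directory

if os.path.exists(archive_path) and not os.path.isdir(extract_dir):
    with zipfile.ZipFile(archive_path) as zf:
        zf.extractall(extract_dir)
```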
47 |
48 | ## How Do I Initialize an Evaluator?
49 |
50 | Add this to your code - before you start batching over the dataset and making predictions:
51 |
52 | ``` python
53 | from sotabencheval.semantic_segmentation import ADE20KEvaluator
54 |
55 | evaluator = ADE20KEvaluator(model_name='My Super Model')
56 | ```
57 |
58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val)
60 | then you will enable direct comparison with the paper. For example:
61 |
62 | ``` python
63 | from sotabencheval.semantic_segmentation import ADE20KEvaluator
64 |
65 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
66 | ```
67 |
68 | The above will directly compare with the result of the paper when run on the server.
69 |
70 | ## How Do I Evaluate Predictions?
71 |
72 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
73 |
74 | For ADE20K there are two required arguments: `outputs`, a 1D np.ndarray of semantic class predictions per pixel,
75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, it requires flattened
76 | inputs and outputs.
77 |
78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output
79 | and the original targets with batch_size `32`, and image size `(520, 480)`. The shape of your outputs might look like:
80 |
81 | ``` python
82 | batch_output.shape
83 | >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes
84 |
85 | batch_target.shape
86 | >> (32, 520, 480)
87 | ```
88 |
89 | We can flatten the entire output and targets to 1D vectors for each pixel:
90 |
91 | ``` python
92 | flattened_batch_output.shape
93 | >> (7987200) # flatten by taking the max class prediction
94 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
95 |
96 | flattened_batch_target.shape
97 | >> (7987200) # (batch_target.flatten() in torch)
98 | ```
99 |
100 | The output might look something like this:
101 |
102 | ``` python
103 | flattened_batch_output
104 | >> array([6, 6, 6, 6, 6, ...])
105 |
106 | flattened_batch_target
107 | >> array([6, 6, 6, 6, 6, ...])
108 | ```
109 |
110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
111 | pixels - so the model is correct.
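If you are not using PyTorch, the same flattening can be done directly with NumPy. A small sketch, assuming `batch_output` has shape `(batch, classes, height, width)` and `batch_target` has shape `(batch, height, width)` as above:

``` python
import numpy as np

# per-pixel predicted class: argmax over the class dimension, then flatten
flattened_batch_output = np.argmax(batch_output, axis=1).flatten()

# per-pixel ground-truth class
flattened_batch_target = batch_target.flatten()
```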
112 |
113 | These flattened arrays can then be passed into the `.add()` method of the evaluator:
114 |
115 | ``` python
116 | evaluator.add(outputs=flattened_batch_output,
117 |               targets=flattened_batch_target)
118 | ```
119 |
120 | You can do this all at once in a single call to `add()`, but more naturally, you will
121 | probably loop over the dataset and call the method for the outputs of each batch.
122 | That would look something like this (for a PyTorch example):
123 |
124 | ``` python
125 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
126 |
127 | with torch.no_grad():
128 | for image, target in tqdm.tqdm(data_loader_test):
129 | image, target = image.to('cuda'), target.to('cuda')
130 | output = model(image)
131 | output = output['out']
132 |
133 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
134 | ```
135 |
136 | When you are done, you can get the results locally by running:
137 |
138 | ``` python
139 | evaluator.get_results()
140 | ```
141 |
142 | But for the server you want to save the results by running:
143 |
144 | ``` python
145 | evaluator.save()
146 | ```
147 |
148 | This method serialises the results and model metadata and stores them in the server database.
149 |
150 | ## How Do I Cache Evaluation?
151 |
152 | Sotabench reruns your script on every commit. This is good because it acts like
153 | continuous integration in checking for bugs and changes, but can be annoying
154 | if the model hasn't changed and evaluation is lengthy.
155 |
156 | Fortunately sotabencheval has caching logic that you can use.
157 |
158 | The idea is that after the first batch, we hash the model outputs and the
159 | current metrics and this tells us if the model is the same given the dataset.
160 | You can include hashing within an evaluation loop as follows (the example below is
161 | for a PyTorch repository):
162 |
163 | ``` python
164 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
165 |
166 | with torch.no_grad():
167 | for image, target in tqdm.tqdm(data_loader_test):
168 | image, target = image.to('cuda'), target.to('cuda')
169 | output = model(image)
170 | output = output['out']
171 |
172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
173 | if evaluator.cache_exists:
174 | break
175 |
176 | evaluator.save()
177 | ```
178 |
179 | If the hash is the same as on the server, we infer that the model hasn't changed, so
180 | we simply return the cached results rather than running the whole evaluation again.
181 |
182 | Caching is very useful if you have large models, or a repository that is evaluating
183 | multiple models, as it speeds up evaluation significantly.
184 |
185 | ## Need More Help?
186 |
187 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
188 | forums if you have any questions or difficulties.
189 |
--------------------------------------------------------------------------------
/docs/docs/coco.md:
--------------------------------------------------------------------------------
1 | # COCO
2 |
3 | 
4 |
5 | You can view the COCO minival leaderboard [here](https://sotabench.com/benchmarks/object-detection-on-coco-minival).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the COCO dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server COCO data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The COCO validation data is located in the root of your repository on the server at `.data/vision/coco`. This folder contains:
30 |
31 | - `annotations_trainval2017.zip` - containing annotations for the validation images
32 | - `val2017.zip` - containing the validation images
33 |
34 | Your local COCO files may have a different file directory structure, so you
35 | can use control flow like below to change the data path if the script is being
36 | run on sotabench servers:
37 |
38 | ``` python
39 | from sotabencheval.utils import is_server
40 |
41 | if is_server():
42 | DATA_ROOT = './.data/vision/coco'
43 | else: # local settings
44 | DATA_ROOT = '/home/ubuntu/my_data/'
45 | ```
46 |
47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
48 |
49 | ## How Do I Initialize an Evaluator?
50 |
51 | Add this to your code - before you start batching over the dataset and making predictions:
52 |
53 | ``` python
54 | from sotabencheval.object_detection import COCOEvaluator
55 |
56 | evaluator = COCOEvaluator(model_name='My Super Model')
57 | ```
58 |
59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/object-detection-on-coco-minival)
61 | then you will enable direct comparison with the paper's model. For example:
62 |
63 | ``` python
64 | from sotabencheval.object_detection import COCOEvaluator
65 |
66 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870')
67 | ```
68 |
69 | The above will directly compare with the result of the paper when run on the server.
70 |
71 | ## How Do I Evaluate Predictions?
72 |
73 | The evaluator object has an [.add()](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco.py#L187) method to submit predictions by batch or in full.
74 |
75 | For COCO the expected input is a list of dictionaries, where each dictionary contains detection information
76 | that will be used by the [loadRes](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco_eval.py#L236) method based on the [pycocotools](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI/pycocotools) API.
77 |
78 | Each detection can take a dictionary
79 | like the following:
80 |
81 | ``` python
82 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
83 | 'score': 0.999152421951294, 'category_id': 1}
84 | ```
85 |
86 | For this benchmark, only bounding box detection ('bbox') is performed at present.
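If your model returns boxes in `(xmin, ymin, xmax, ymax)` format together with score and label tensors (as torchvision detection models do), a sketch of the conversion into this list-of-dictionaries format might look like the following; the helper name and the assumed prediction keys are illustrative, and a fuller version appears in the complete `sotabench.py` example at the end of this page:

``` python
import torch

def to_coco_detections(image_id, prediction):
    # convert (xmin, ymin, xmax, ymax) boxes into COCO's (x, y, width, height) format
    xmin, ymin, xmax, ymax = prediction["boxes"].unbind(1)
    boxes = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1).tolist()
    scores = prediction["scores"].tolist()
    labels = prediction["labels"].tolist()
    return [
        {"image_id": image_id, "category_id": labels[k], "bbox": box, "score": scores[k]}
        for k, box in enumerate(boxes)
    ]
```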
87 |
88 | You can do this all at once in a single call to `add()`, but more naturally, you will
89 | probably loop over the dataset and call the method for the outputs of each batch.
90 | That would look something like this (for a PyTorch example):
91 |
92 | ``` python
93 | ...
94 |
95 | evaluator = COCOEvaluator(
96 | model_name='Mask R-CNN',
97 | paper_arxiv_id='1703.06870')
98 |
99 | with torch.no_grad():
100 | for i, (input, target) in enumerate(data_loader):
101 | ...
102 | output = model(input)
103 | # potentially formatting of the output here to be a list of dicts
104 | evaluator.add(output)
105 | ```
106 |
107 | When you are done, you can get the results locally by running:
108 |
109 | ``` python
110 | evaluator.get_results()
111 | ```
112 |
113 | But for the server you want to save the results by running:
114 |
115 | ``` python
116 | evaluator.save()
117 | ```
118 |
119 | This method serialises the results and model metadata and stores them in the server database.
120 |
121 | ## How Do I Cache Evaluation?
122 |
123 | Sotabench reruns your script on every commit. This is good because it acts like
124 | continuous integration in checking for bugs and changes, but can be annoying
125 | if the model hasn't changed and evaluation is lengthy.
126 |
127 | Fortunately sotabencheval has caching logic that you can use.
128 |
129 | The idea is that after the first batch, we hash the model outputs and the
130 | current metrics and this tells us if the model is the same given the dataset.
131 | You can include hashing within an evaluation loop as follows (the example below is
132 | for a PyTorch repository):
133 |
134 | ``` python
135 | with torch.no_grad():
136 | for i, (input, target) in enumerate(data_loader):
137 | ...
138 | output = model(input)
139 | # potentially formatting of the output here to be a list of dicts
140 | evaluator.add(output)
141 |
142 | if evaluator.cache_exists:
143 | break
144 |
145 | evaluator.save()
146 | ```
147 |
148 | If the hash is the same as on the server, we infer that the model hasn't changed, so
149 | we simply return the cached results rather than running the whole evaluation again.
150 |
151 | Caching is very useful if you have large models, or a repository that is evaluating
152 | multiple models, as it speeds up evaluation significantly.
153 |
154 | ## A Full sotabench.py Example
155 |
156 | Below we show an implementation for a model from the torchvision repository. This
157 | incorporates all the features explained above: (a) using the server data root,
158 | (b) using the COCO Evaluator, and (c) caching the evaluation logic. Note that the
159 | torchbench dependency is just to get some processing logic and transforms; the evaluation
160 | is done with sotabencheval.
161 |
162 | ``` python
163 | import os
164 | import tqdm
165 | import torch
166 | from torch.utils.data import DataLoader
167 | from torchbench.utils import send_model_to_device
168 | from torchbench.object_detection.transforms import Compose, ConvertCocoPolysToMask, ToTensor
169 | import torchvision
170 | import torchbench.datasets  # provides torchbench.datasets.CocoDetection used below
171 |
172 | from sotabencheval.object_detection import COCOEvaluator
173 | from sotabencheval.utils import is_server
174 |
175 | if is_server():
176 | DATA_ROOT = './.data/vision/coco'
177 | else: # local settings
178 | DATA_ROOT = '/home/ubuntu/my_data/'
179 |
180 | def coco_data_to_device(input, target, device: str = "cuda", non_blocking: bool = True):
181 | input = list(inp.to(device=device, non_blocking=non_blocking) for inp in input)
182 | target = [{k: v.to(device=device, non_blocking=non_blocking) for k, v in t.items()} for t in target]
183 | return input, target
184 |
185 | def coco_collate_fn(batch):
186 | return tuple(zip(*batch))
187 |
188 | def coco_output_transform(output, target):
189 | output = [{k: v.to("cpu") for k, v in t.items()} for t in output]
190 | return output, target
191 |
192 | transforms = Compose([ConvertCocoPolysToMask(), ToTensor()])
193 |
194 | model = torchvision.models.detection.__dict__['maskrcnn_resnet50_fpn'](num_classes=91, pretrained=True)
195 |
196 | model, device = send_model_to_device(
197 | model, device='cuda', num_gpu=1
198 | )
199 | model.eval()
200 |
201 | model_output_transform = coco_output_transform
202 | send_data_to_device = coco_data_to_device
203 | collate_fn = coco_collate_fn
204 |
205 | test_dataset = torchbench.datasets.CocoDetection(
206 | root=os.path.join(DATA_ROOT, "val%s" % '2017'),
207 | annFile=os.path.join(
208 | DATA_ROOT, "annotations/instances_val%s.json" % '2017'
209 | ),
210 | transform=None,
211 | target_transform=None,
212 | transforms=transforms,
213 | download=True,
214 | )
215 | test_loader = DataLoader(
216 | test_dataset,
217 | batch_size=8,
218 | shuffle=False,
219 | num_workers=4,
220 | pin_memory=True,
221 | collate_fn=collate_fn,
222 | )
223 | test_loader.no_classes = 91 # Number of classes for COCO Detection
224 |
225 | iterator = tqdm.tqdm(test_loader, desc="Evaluation", mininterval=5)
226 |
227 | evaluator = COCOEvaluator(
228 | root=DATA_ROOT,
229 | model_name='Mask R-CNN (ResNet-50-FPN)',
230 | paper_arxiv_id='1703.06870')
231 |
232 | def prepare_for_coco_detection(predictions):
233 | coco_results = []
234 | for original_id, prediction in predictions.items():
235 | if len(prediction) == 0:
236 | continue
237 |
238 | boxes = prediction["boxes"]
239 | boxes = convert_to_xywh(boxes).tolist()
240 | scores = prediction["scores"].tolist()
241 | labels = prediction["labels"].tolist()
242 |
243 | coco_results.extend(
244 | [
245 | {
246 | "image_id": original_id,
247 | "category_id": labels[k],
248 | "bbox": box,
249 | "score": scores[k],
250 | }
251 | for k, box in enumerate(boxes)
252 | ]
253 | )
254 | return coco_results
255 |
256 | def convert_to_xywh(boxes):
257 | xmin, ymin, xmax, ymax = boxes.unbind(1)
258 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
259 |
260 | with torch.no_grad():
261 | for i, (input, target) in enumerate(iterator):
262 | input, target = send_data_to_device(input, target, device=device)
263 | original_output = model(input)
264 | output, target = model_output_transform(original_output, target)
265 | result = {
266 | tar["image_id"].item(): out for tar, out in zip(target, output)
267 | }
268 | result = prepare_for_coco_detection(result)
269 |
270 | evaluator.add(result)
271 |
272 | if evaluator.cache_exists:
273 | break
274 |
275 | evaluator.save()
276 | ```
277 |
278 | ## Need More Help?
279 |
280 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
281 | forums if you have any questions or difficulties.
282 |
--------------------------------------------------------------------------------
/docs/docs/imagenet.md:
--------------------------------------------------------------------------------
1 | # ImageNet
2 |
3 | 
4 |
5 | You can view the ImageNet leaderboard [here](https://sotabench.com/benchmarks/image-classification-on-imagenet).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ImageNet dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server ImageNet data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The ImageNet validation data is located in the root of your repository on the server at `.data/vision/imagenet`. This folder contains:
30 |
31 | - `ILSVRC2012_devkit_t12.tar.gz` - containing metadata
32 | - `ILSVRC2012_img_val.tar` - containing the validation images
33 |
34 | Your local ImageNet files may have a different file directory structure, so you
35 | can use control flow like below to change the data path if the script is being
36 | run on sotabench servers:
37 |
38 | ``` python
39 | from sotabencheval.utils import is_server
40 |
41 | if is_server():
42 | DATA_ROOT = './.data/vision/imagenet'
43 | else: # local settings
44 | DATA_ROOT = '/home/ubuntu/my_data/'
45 | ```
46 |
47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
48 |
49 | ## How Do I Initialize an Evaluator?
50 |
51 | Add this to your code - before you start batching over the dataset and making predictions:
52 |
53 | ``` python
54 | from sotabencheval.image_classification import ImageNetEvaluator
55 |
56 | evaluator = ImageNetEvaluator(model_name='My Super Model')
57 | ```
58 |
59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/image-classification-on-imagenet)
61 | then you will enable direct comparison with the paper's model. For example:
62 |
63 | ``` python
64 | from sotabencheval.image_classification import ImageNetEvaluator
65 |
66 | evaluator = ImageNetEvaluator(model_name='FixResNeXt-101 32x48d',
67 | paper_arxiv_id='1906.06423')
68 | ```
69 |
70 | The above will directly compare with the result of the paper when run on the server.
71 |
72 | ## How Do I Evaluate Predictions?
73 |
74 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
75 |
76 | For ImageNet the expected input is a dictionary of outputs, where each key is an
77 | image ID from ImageNet and each value is a list or 1D numpy array of logits for that
78 | image ID. For example:
79 |
80 | ``` python
81 | evaluator.add({'ILSVRC2012_val_00000293': np.array([1.04243, ...]),
82 | 'ILSVRC2012_val_00000294': np.array([-2.3677, ...])})
83 | ```
84 |
85 | You can do this all at once in a single call to `add()`, but more naturally, you will
86 | probably loop over the dataset and call the method for the outputs of each batch.
87 | That would look something like this (for a PyTorch example):
88 |
89 | ``` python
90 | for i, (input, target) in enumerate(test_loader):
91 | input = input.to(device='cuda', non_blocking=True)
92 | target = target.to(device='cuda', non_blocking=True)
93 | output = model(input)
94 |
95 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
96 |
97 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
98 | ```
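Here `get_img_id` is a small helper that derives the image ID from the validation image filename (assuming a torchvision-style folder layout); the full example at the bottom of this page defines it the same way:

``` python
def get_img_id(image_name):
    # e.g. '.../ILSVRC2012_val_00000293.JPEG' -> 'ILSVRC2012_val_00000293'
    return image_name.split('/')[-1].replace('.JPEG', '')
```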
99 |
100 | When you are done, you can get the results locally by running:
101 |
102 | ``` python
103 | evaluator.get_results()
104 | ```
105 |
106 | But for the server you want to save the results by running:
107 |
108 | ``` python
109 | evaluator.save()
110 | ```
111 |
112 | This method serialises the results and model metadata and stores them in the server database.
113 |
114 | ## How Do I Cache Evaluation?
115 |
116 | Sotabench reruns your script on every commit. This is good because it acts like
117 | continuous integration in checking for bugs and changes, but can be annoying
118 | if the model hasn't changed and evaluation is lengthy.
119 |
120 | Fortunately sotabencheval has caching logic that you can use.
121 |
122 | The idea is that after the first batch, we hash the model outputs and the
123 | current metrics and this tells us if the model is the same given the dataset.
124 | You can include hashing within an evaluation loop as follows (the example below is
125 | for a PyTorch repository):
126 |
127 | ``` python
128 | with torch.no_grad():
129 | for i, (input, target) in enumerate(test_loader):
130 | input = input.to(device='cuda', non_blocking=True)
131 | target = target.to(device='cuda', non_blocking=True)
132 | output = model(input)
133 |
134 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
135 |
136 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
137 |
138 | if evaluator.cache_exists:
139 | break
140 |
141 | evaluator.save()
142 | ```
143 |
144 | If the hash is the same as on the server, we infer that the model hasn't changed, so
145 | we simply return the cached results rather than running the whole evaluation again.
146 |
147 | Caching is very useful if you have large models, or a repository that is evaluating
148 | multiple models, as it speeds up evaluation significantly.
149 |
150 | ## A full sotabench.py example
151 |
152 | Below we show an implementation for a model from the torchvision repository. This
153 | incorporates all the features explained above: (a) using the server data root,
154 | (b) using the ImageNet Evaluator, and (c) caching the evaluation logic:
155 |
156 | ``` python
157 | import numpy as np
158 | import PIL
159 | import torch
160 | from torchvision.models.resnet import resnext101_32x8d
161 | import torchvision.transforms as transforms
162 | from torchvision.datasets import ImageNet
163 | from torch.utils.data import DataLoader
164 |
165 | from sotabencheval.image_classification import ImageNetEvaluator
166 | from sotabencheval.utils import is_server
167 |
168 | if is_server():
169 | DATA_ROOT = './.data/vision/imagenet'
170 | else: # local settings
171 | DATA_ROOT = '/home/ubuntu/my_data/'
172 |
173 | model = resnext101_32x8d(pretrained=True)
174 |
175 | input_transform = transforms.Compose([
176 | transforms.Resize(256, PIL.Image.BICUBIC),
177 | transforms.CenterCrop(224),
178 | transforms.ToTensor(),
179 | transforms.Normalize(
180 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
181 | ])
182 |
183 | test_dataset = ImageNet(
184 | DATA_ROOT,
185 | split="val",
186 | transform=input_transform,
187 | target_transform=None,
188 | download=True,
189 | )
190 |
191 | test_loader = DataLoader(
192 | test_dataset,
193 | batch_size=128,
194 | shuffle=False,
195 | num_workers=4,
196 | pin_memory=True,
197 | )
198 |
199 | model = model.cuda()
200 | model.eval()
201 |
202 | evaluator = ImageNetEvaluator(
203 | model_name='ResNeXt-101-32x8d',
204 | paper_arxiv_id='1611.05431')
205 |
206 | def get_img_id(image_name):
207 | return image_name.split('/')[-1].replace('.JPEG', '')
208 |
209 | with torch.no_grad():
210 | for i, (input, target) in enumerate(test_loader):
211 | input = input.to(device='cuda', non_blocking=True)
212 | target = target.to(device='cuda', non_blocking=True)
213 | output = model(input)
214 |
215 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
216 |
217 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
218 |
219 | if evaluator.cache_exists:
220 | break
221 |
222 | evaluator.save()
223 | ```
224 |
225 | ## Need More Help?
226 |
227 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
228 | forums if you have any questions or difficulties.
229 |
--------------------------------------------------------------------------------
/docs/docs/img/ade20k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/ade20k.png
--------------------------------------------------------------------------------
/docs/docs/img/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/banner.png
--------------------------------------------------------------------------------
/docs/docs/img/coco.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/coco.jpg
--------------------------------------------------------------------------------
/docs/docs/img/connect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect.png
--------------------------------------------------------------------------------
/docs/docs/img/connect2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect2.png
--------------------------------------------------------------------------------
/docs/docs/img/examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/examples.png
--------------------------------------------------------------------------------
/docs/docs/img/imagenet.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/imagenet.jpeg
--------------------------------------------------------------------------------
/docs/docs/img/language_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/language_model.png
--------------------------------------------------------------------------------
/docs/docs/img/pascalvoc2012.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/pascalvoc2012.png
--------------------------------------------------------------------------------
/docs/docs/img/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/results.png
--------------------------------------------------------------------------------
/docs/docs/img/sotabencheval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/sotabencheval.png
--------------------------------------------------------------------------------
/docs/docs/img/squad20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/squad20.png
--------------------------------------------------------------------------------
/docs/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to sotabencheval!
2 |
3 | 
4 |
5 | You have reached the docs for the [sotabencheval](https://github.com/paperswithcode/sotabench-eval) library. This library contains a collection of deep learning benchmarks you can use to
6 | benchmark your models. It can be used in conjunction with the
7 | [sotabench.com](http://www.sotabench.com) website to record results for models, so the community
8 | can compare model performance on different tasks. It also acts as a continuous integration style
9 | service for your repository, benchmarking your models on each commit.
10 |
11 | **sotabencheval** is a general benchmarking library, meaning it is designed to support all deep learning frameworks,
12 | and requires minimal code integration. There are alternative sotabench APIs you can use that are
13 | specialized for particular frameworks, e.g. [torchbench](https://github.com/paperswithcode/torchbench) for PyTorch.
14 |
15 |
16 | ## Getting Started : Benchmarking on ImageNet
17 |
18 | **Step One : Create a sotabench.py file in the root of your repository**
19 |
20 | This can contain whatever logic you need to load and process the dataset, and to
21 | produce model predictions for it. To record your results for sotabench, initialise
22 | an ImageNet evaluator object to name the model and (optionally) link it to a paper:
23 |
24 | ``` python
25 | from sotabencheval.image_classification import ImageNetEvaluator
26 |
27 | evaluator = ImageNetEvaluator(
28 | model_name='ResNeXt-101-32x8d',
29 | paper_arxiv_id='1611.05431')
30 | ```
31 |
32 | For each batch of predictions made by your model, pass a dictionary with image IDs as keys and
33 | output predictions as values to the `evaluator.add` method:
34 |
35 | ``` python
36 | evaluator.add(dict(zip(image_ids, batch_output)))
37 | ```
38 | Then after you have accumulated all the predictions:
39 |
40 | ``` python
41 | evaluator.save()
42 | ```
43 |
44 | This will ensure results are evaluated and saved when they are run on the [sotabench](http://www.sotabench.com) server.
45 |
46 | Below you can see a working `sotabench.py` file added to the [torchvision](https://github.com/pytorch/vision) repository
47 | to test one of its models, integrating the evaluation code from above:
48 |
49 | ``` python
50 | import numpy as np
51 | import PIL
52 | import torch
53 | from torch.utils.data import DataLoader
54 | from torchvision.models.resnet import resnext101_32x8d
55 | import torchvision.transforms as transforms
56 | from torchvision.datasets import ImageNet
57 |
58 | from sotabencheval.image_classification import ImageNetEvaluator
59 | from sotabencheval.utils import is_server
60 |
61 | if is_server():
62 | DATA_ROOT = './.data/vision/imagenet'
63 | else: # local settings
64 | DATA_ROOT = '/home/ubuntu/my_data/'
65 |
66 | model = resnext101_32x8d(pretrained=True)
67 |
68 | input_transform = transforms.Compose([
69 | transforms.Resize(256, PIL.Image.BICUBIC),
70 | transforms.CenterCrop(224),
71 | transforms.ToTensor(),
72 | transforms.Normalize(
73 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
74 | ])
75 |
76 | test_dataset = ImageNet(
77 | DATA_ROOT,
78 | split="val",
79 | transform=input_transform,
80 | target_transform=None,
81 | download=True,
82 | )
83 |
84 | test_loader = DataLoader(
85 | test_dataset,
86 | batch_size=128,
87 | shuffle=False,
88 | num_workers=4,
89 | pin_memory=True,
90 | )
91 |
92 | model = model.cuda()
93 | model.eval()
94 |
95 | evaluator = ImageNetEvaluator(
96 | model_name='ResNeXt-101-32x8d',
97 | paper_arxiv_id='1611.05431')
98 |
99 | def get_img_id(image_name):
100 | return image_name.split('/')[-1].replace('.JPEG', '')
101 |
102 | with torch.no_grad():
103 | for i, (input, target) in enumerate(test_loader):
104 | input = input.to(device='cuda', non_blocking=True)
105 | target = target.to(device='cuda', non_blocking=True)
106 | output = model(input)
107 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
108 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
109 |
110 | evaluator.save()
111 | ```
112 |
113 | **Step Two : Run locally to verify that it works**
114 |
115 | ```
116 | python sotabench.py
117 | ```
118 |
119 | You can also run the logic in a Jupyter Notebook if that is your preferred workflow.
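When running locally, you can also print the computed metrics before connecting the repository; a short sketch, assuming the `evaluator` from the example above:

``` python
results = evaluator.get_results()
print(results)
```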
120 |
121 | **Step Three : Login and connect your repository to [sotabench](http://www.sotabench.com)**
122 |
123 | Create an account on [sotabench](http://www.sotabench.com), then head to your user page. Click the
124 | **Connect a GitHub repository** button:
125 |
126 | ![Connect](img/connect.png)
127 |
128 | Then follow the steps to connect the repositories that you wish to benchmark:
129 |
130 | 
131 |
132 |
133 | After you connect your repository, the sotabench servers will re-evaluate your model on every commit,
134 | to ensure the model is working and results are up-to-date - including if you add additional models to the benchmark file.
135 |
136 | ## Installation
137 |
138 | The library requires Python 3.6+. You can install via pip:
139 |
140 | ```
141 | pip install sotabencheval
142 | ```
143 |
144 | ## Support
145 |
146 | If you get stuck you can head to our [Discourse](http://forum.sotabench.com) forum where you can ask
147 | questions about how to use the project. You can also find ideas for contributions,
148 | and work with others on exciting projects.
--------------------------------------------------------------------------------
/docs/docs/pascalvoc.md:
--------------------------------------------------------------------------------
1 | # PASCAL VOC 2012
2 |
3 | 
4 |
5 | You can view the PASCAL VOC 2012 leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the VOC 2012 dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server VOC 2012 data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The VOC 2012 data is located in the root of your repository on the server at `.data/vision/voc2012`. This folder contains:
30 |
31 | - `VOCtrainval_11-May-2012.tar` - containing validation images and annotations
32 |
33 | Your local VOC 2012 files may have a different file directory structure, so you
34 | can use control flow like below to change the data path if the script is being
35 | run on sotabench servers:
36 |
37 | ``` python
38 | from sotabencheval.utils import is_server
39 |
40 | if is_server():
41 | DATA_ROOT = './.data/vision/voc2012'
42 | else: # local settings
43 | DATA_ROOT = '/home/ubuntu/my_data/'
44 | ```
45 |
46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
47 |
48 | ## How Do I Initialize an Evaluator?
49 |
50 | Add this to your code - before you start batching over the dataset and making predictions:
51 |
52 | ``` python
53 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
54 |
55 | evaluator = PASCALVOCEvaluator(model_name='My Super Model')
56 | ```
57 |
58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012)
60 | then you will enable direct comparison with the paper. For example:
61 |
62 | ``` python
63 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
64 |
65 | evaluator = PASCALVOCEvaluator(model_name='PSPNet', paper_arxiv_id='1612.01105')
66 | ```
67 |
68 | The above will directly compare with the result of the paper when run on the server.
69 |
70 | ## How Do I Evaluate Predictions?
71 |
72 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
73 |
74 | For PASCAL VOC there are two required arguments: `outputs`, a 1D np.ndarray of predicted semantic classes per pixel,
75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, the method expects flattened
76 | predictions and targets.
77 |
78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output
79 | and the original targets with batch_size `32`, and image size `(520, 480)`. The shape of your outputs might look like:
80 |
81 | ``` python
82 | batch_output.shape
83 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes
84 |
85 | batch_target.shape
86 | >> (32, 520, 480)
87 | ```
88 |
89 | We can flatten the entire output and targets to 1D vectors for each pixel:
90 |
91 | ``` python
92 | flattened_batch_output.shape
93 | >> (7987200,) # flatten by taking the max class prediction per pixel
94 | # (batch_output.argmax(1).flatten() in torch, with the class as the second dimension)
95 | 
96 | flattened_batch_target.shape
97 | >> (7987200,) # (batch_target.flatten() in torch)
98 | ```
99 |
100 | The output might look something like this:
101 |
102 | ``` python
103 | flattened_batch_output
104 | >> array([6, 6, 6, 6, 6, ...])
105 |
106 | flattened_batch_target
107 | >> array([6, 6, 6, 6, 6, ...])
108 | ```
109 |
110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first five
111 | pixels - so the model is correct for those pixels.
112 |
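As a concrete sketch of that flattening step (assuming `batch_output` and `batch_target` are PyTorch tensors shaped as above):

``` python
# Flatten the raw model output and targets into the 1D arrays the evaluator expects
flattened_batch_output = batch_output.argmax(1).flatten().cpu().numpy()  # shape (32*520*480,)
flattened_batch_target = batch_target.flatten().cpu().numpy()            # shape (32*520*480,)
```
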
113 | These flattened arrays can then be passed into the `.add()` method of the evaluator:
114 | 
115 | ``` python
116 | evaluator.add(outputs=flattened_batch_output,
117 |               targets=flattened_batch_target)
118 | ```
119 |
120 | You can do this all at once in a single call to `add()`, but more naturally, you will
121 | probably loop over the dataset and call the method for the outputs of each batch.
122 | That would look something like this (for a PyTorch example):
123 |
124 | ``` python
125 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
126 |
127 | with torch.no_grad():
128 | for image, target in tqdm.tqdm(data_loader_test):
129 | image, target = image.to('cuda'), target.to('cuda')
130 | output = model(image)
131 | output = output['out']
132 |
133 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
134 | ```
135 |
136 | When you are done, you can get the results locally by running:
137 |
138 | ``` python
139 | evaluator.get_results()
140 | ```
141 |
142 | But for the server you want to save the results by running:
143 |
144 | ``` python
145 | evaluator.save()
146 | ```
147 |
148 | This method serialises the results and model metadata and stores them in the server database.
149 |
150 | ## How Do I Cache Evaluation?
151 |
152 | Sotabench reruns your script on every commit. This is good because it acts like
153 | continuous integration in checking for bugs and changes, but can be annoying
154 | if the model hasn't changed and evaluation is lengthy.
155 |
156 | Fortunately sotabencheval has caching logic that you can use.
157 |
158 | The idea is that after the first batch, we hash the model outputs and the
159 | current metrics; this tells us whether the model has changed, given the dataset.
160 | You can include the cache check within an evaluation loop as follows (the example
161 | below is for a PyTorch repository):
162 |
163 | ``` python
164 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
165 |
166 | with torch.no_grad():
167 | for image, target in tqdm.tqdm(data_loader_test):
168 | image, target = image.to('cuda'), target.to('cuda')
169 | output = model(image)
170 | output = output['out']
171 |
172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
173 | if evaluator.cache_exists:
174 | break
175 |
176 | evaluator.save()
177 | ```
178 |
179 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
180 | we simply return the cached results rather than running the whole evaluation again.
181 |
182 | Caching is very useful if you have large models, or a repository that is evaluating
183 | multiple models, as it speeds up evaluation significantly.
184 |
185 | ## A full sotabench.py example
186 |
187 | Below we show an implementation for a model from the torchvision repository. This
188 | incorporates all the features explained above: (a) using the server data root,
189 | (b) using the PASCAL VOC Evaluator, and (c) caching the evaluation logic:
190 |
191 | ``` python
192 | import PIL
193 | import torch
194 | import torchvision
195 | from torchvision.models.segmentation import fcn_resnet101
196 | import torchvision.transforms as transforms
197 | import tqdm
198 |
199 | from sotabench_transforms import Normalize, Compose, Resize, ToTensor
200 |
201 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
202 | from sotabencheval.utils import is_server
203 |
204 | if is_server():
205 | DATA_ROOT = './.data/vision/voc2012'
206 | else: # local settings
207 | DATA_ROOT = '/home/ubuntu/my_data/'
208 |
209 | MODEL_NAME = 'fcn_resnet101'
210 |
211 | def cat_list(images, fill_value=0):
212 | max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
213 | batch_shape = (len(images),) + max_size
214 | batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
215 | for img, pad_img in zip(images, batched_imgs):
216 | pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
217 | return batched_imgs
218 |
219 | def collate_fn(batch):
220 | images, targets = list(zip(*batch))
221 | batched_imgs = cat_list(images, fill_value=0)
222 | batched_targets = cat_list(targets, fill_value=255)
223 | return batched_imgs, batched_targets
224 |
225 | device = torch.device('cuda')
226 |
227 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
228 | my_transforms = Compose([Resize((520, 480)), ToTensor(), normalize])
229 |
230 | dataset_test = torchvision.datasets.VOCSegmentation(root=DATA_ROOT, year='2012', image_set="val",
231 | transforms=my_transforms, download=True)
232 | test_sampler = torch.utils.data.SequentialSampler(dataset_test)
233 |
234 | data_loader_test = torch.utils.data.DataLoader(
235 | dataset_test, batch_size=32,
236 | sampler=test_sampler, num_workers=4,
237 | collate_fn=collate_fn)
238 |
239 | model = torchvision.models.segmentation.__dict__['fcn_resnet101'](num_classes=21, pretrained=True)
240 | model.to(device)
241 | model.eval()
242 |
243 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
244 |
245 | with torch.no_grad():
246 | for image, target in tqdm.tqdm(data_loader_test):
247 | image, target = image.to('cuda'), target.to('cuda')
248 | output = model(image)
249 | output = output['out']
250 |
251 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
252 | if evaluator.cache_exists:
253 | break
254 |
255 | evaluator.save()
256 | ```
257 |
258 | ## Need More Help?
259 |
260 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
261 | forums if you have any questions or difficulties.
262 |
--------------------------------------------------------------------------------
/docs/docs/squad.md:
--------------------------------------------------------------------------------
1 | # SQuAD
2 |
3 | 
4 |
5 | You can view the [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) and
6 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboards.
7 |
8 | ## Getting Started
9 |
10 | You'll need the following in the root of your repository:
11 |
12 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
13 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
14 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
15 |
16 | You can write whatever you want in your `sotabench.py` file to get model predictions on the SQuAD dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
22 | - **Point to the server SQuAD data path** - popular datasets are pre-downloaded on the server.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## How Do I Initialize an Evaluator?
28 |
29 | Add this to your code - before you start batching over the dataset and making predictions:
30 |
31 | ``` python
32 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
33 |
34 | # for SQuAD v1.1
35 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V11)
36 | # for SQuAD v2.0
37 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V20)
38 | ```
39 |
40 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
41 | put in the same model name string as on the
42 | [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) or
43 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboard
44 | then you will enable direct comparison with the paper's model. For example:
45 |
46 | ``` python
47 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
48 |
49 | evaluator = SQuADEvaluator(model_name='SpanBERT',
50 | paper_arxiv_id='1907.10529',
51 | version=SQuADVersion.V20)
52 | ```
53 |
54 | The above will directly compare with the result of the paper when run on the server.
55 |
56 | ## Server Data Location
57 |
58 | The SQuAD development data is located in the root of your repository on the server at `.data/nlp/squad`.
59 | This folder contains:
60 |
61 | - `dev-v1.1.json` - containing SQuAD v1.1 development dataset
62 | - `dev-v2.0.json` - containing SQuAD v2.0 development dataset
63 |
64 | You can use `evaluator.dataset_path: Path` to get a path to the dataset json file.
65 | In the example above it resolves to `.data/nlp/squad/dev-v2.0.json` on
66 | sotabench server and `./dev-v2.0.json` when run locally.
67 | If you want to use a non-standard file name or location when running locally
68 | you can override the defaults like this:
69 |
70 | ``` python
71 | evaluator = SQuADEvaluator(
72 | ...,
73 | local_root='mydatasets',
74 | dataset_filename='data.json'
75 | )
76 | ```
77 |
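If you prefer to read the file yourself, here is a minimal sketch (illustrative helper code, not part of `sotabencheval`) that loads the dev set from `evaluator.dataset_path` using the standard SQuAD JSON layout:

``` python
import json

# Collect (question id, question, context) triples from the dev set
with open(evaluator.dataset_path) as f:
    squad = json.load(f)

examples = []
for article in squad["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            examples.append((qa["id"], qa["question"], paragraph["context"]))
```
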
78 | ## How Do I Evaluate Predictions?
79 |
80 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full.
81 |
82 | For SQuAD the expected input is a dictionary, where keys are question ids and values are text answers.
83 | For unanswerable questions the answer should be an empty string. For example:
84 |
85 | ``` python
86 | {"57296d571d04691400779413": "itself", "5a89117e19b91f001a626f2d": ""}
87 | ```
88 |
89 | You can do this all at once in a single call to `add()`, but more naturally, you will
90 | probably loop over the dataset and call the method for the outputs of each batch.
91 | That would look something like this (for a PyTorch example):
92 |
93 | ``` python
94 | ...
95 |
96 | evaluator = SQuADEvaluator(model_name='My Super Model',
97 | paper_arxiv_id="1710.10723",
98 | version=SQuADVersion.V11)
99 |
100 | with torch.no_grad():
101 | for i, (input, target) in enumerate(data_loader):
102 | ...
103 | output = model(input)
104 | # potentially formatting of the output here to be a dict
105 | evaluator.add(output)
106 | ```
107 |
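The "formatting" step depends on your model, but it usually just builds the id-to-answer dictionary for the batch. As an illustration (`batch_question_ids` and `batch_answers` are hypothetical names, not part of the library):

``` python
# Pair question ids with predicted answer strings;
# unanswerable questions should map to an empty string "".
answers = {qid: text for qid, text in zip(batch_question_ids, batch_answers)}
evaluator.add(answers)
```
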
108 | When you are done, you can get the results locally by running:
109 |
110 | ``` python
111 | evaluator.get_results()
112 | ```
113 |
114 | But for the server you want to save the results by running:
115 |
116 | ``` python
117 | evaluator.save()
118 | ```
119 |
120 | This method serialises the results and model metadata and stores them in the server database.
121 |
122 | ## How Do I Cache Evaluation?
123 |
124 | Sotabench reruns your script on every commit. This is good because it acts like
125 | continuous integration in checking for bugs and changes, but can be annoying
126 | if the model hasn't changed and evaluation is lengthy.
127 |
128 | Fortunately sotabencheval has caching logic that you can use.
129 |
130 | The idea is that after the first batch, we hash the model outputs and the
131 | current metrics; this tells us whether the model has changed, given the dataset.
132 | You can include the cache check within an evaluation loop as follows (the example
133 | below is for a PyTorch repository):
134 |
135 | ``` python
136 | with torch.no_grad():
137 | for i, (input, target) in enumerate(data_loader):
138 | ...
139 | output = model(input)
140 | # potentially formatting of the output here to be a list of dicts
141 | evaluator.add(output)
142 |
143 | if evaluator.cache_exists:
144 | break
145 |
146 | evaluator.save()
147 | ```
148 |
149 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
150 | we simply return the cached results rather than running the whole evaluation again.
151 |
152 | Caching is very useful if you have large models, or a repository that is evaluating
153 | multiple models, as it speeds up evaluation significantly.
154 |
155 | ## A Full sotabench.py Example
156 |
157 | Below we show an implementation for a model from the AllenNLP repository. This
158 | incorporates all the features explained above: (a) using the SQuAD Evaluator,
159 | (b) using a custom dataset location when run locally, and (c) the evaluation caching logic.
160 |
161 | ``` python
162 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
163 |
164 | from allennlp.data import DatasetReader
165 | from allennlp.data.iterators import DataIterator
166 | from allennlp.models.archival import load_archive
167 | from allennlp.nn.util import move_to_device
168 |
169 | def load_model(url, batch_size=64):
170 | archive = load_archive(url, cuda_device=0)
171 | model = archive.model
172 | reader = DatasetReader.from_params(archive.config["dataset_reader"])
173 | iterator_params = archive.config["iterator"]
174 | iterator_params["batch_size"] = batch_size
175 | data_iterator = DataIterator.from_params(iterator_params)
176 | data_iterator.index_with(model.vocab)
177 | return model, reader, data_iterator
178 |
179 | def evaluate(model, dataset, data_iterator, evaluator):
180 | model.eval()
181 | evaluator.reset_time()
182 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
183 | batch = move_to_device(batch, 0)
184 | predictions = model(**batch)
185 | answers = {metadata['id']: prediction
186 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
187 | evaluator.add(answers)
188 | if evaluator.cache_exists:
189 | break
190 |
191 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
192 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
193 |
194 | model, reader, data_iter =\
195 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
196 | dataset = reader.read(evaluator.dataset_path)
197 | evaluate(model, dataset, data_iter, evaluator)
198 | evaluator.save()
199 | print(evaluator.results)
200 | ```
201 |
202 | ## Need More Help?
203 |
204 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench
205 | forums if you have any questions or difficulties.
206 |
--------------------------------------------------------------------------------
/docs/docs/wikitext103.md:
--------------------------------------------------------------------------------
1 | # WikiText-103
2 |
3 | 
4 |
5 | You can view the WikiText-103 leaderboard [here](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get language model predictions on the WikiText-103 dataset.
16 |
17 | But you will need to record your results for the server, and you'll want to avoid doing things like
18 | downloading the dataset on the server. So you should:
19 |
20 | - **Point to the server WikiText-103 data path** - popular datasets are pre-downloaded on the server.
21 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
22 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
23 |
24 | We explain how to do these various steps below.
25 |
26 | ## Server Data Location
27 |
28 | The WikiText-103 data is located in the root of your repository on the server at `.data/nlp/wikitext-103/wikitext-103-v1.zip`.
29 | The archive contains a folder `wikitext-103` with the following files:
30 |
31 | - `wiki.train.tokens`
32 | - `wiki.valid.tokens`
33 | - `wiki.test.tokens`
34 |
35 | This is the original zip file released [here](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
36 | The benchmark is run on the `wiki.test.tokens` file.
37 | We provide two helper methods that unpack the dataset for you and return the `pathlib.Path` to the test file.
38 |
39 | The first option, `evaluator.test_set_path`, is available once you instantiate the `WikiText103Evaluator`:
40 |
41 | ```python
42 | ...
43 |
44 | evaluator = WikiText103Evaluator(
45 | model_name="Transformer-XL Large",
46 | paper_arxiv_id="1901.02860",
47 | paper_pwc_id="transformer-xl-attentive-language-models",
48 | local_root='/content/wikitext-103'
49 | )
50 | # test_set_path is a pathlib.Path that points to wiki.test.tokens
51 | with evaluator.test_set_path.open() as f:
52 | test_data = torch.tensor(tokenizer.encode(f.read())).to("cuda")
53 | ```
54 |
55 | The second option, `WikiText103Evaluator.get_test_set_path(local_root)`, is useful if you are evaluating multiple
56 | models and need to use the same dataset multiple times. It returns the path before
57 | you initialize a WikiText evaluator:
58 |
59 | ```python
60 | from sotabencheval.language_modelling import WikiText103Evaluator
61 |
62 | test_file_path = WikiText103Evaluator.get_test_set_path('/home/ubuntu/my_data/wiki103')
63 | with test_file_path.open() as f:
64 | content = f.read()
65 | ```
66 |
67 | ## How Do I Initialize an Evaluator?
68 |
69 | Add this to your code - before you start batching over the dataset and making predictions:
70 |
71 | ``` python
72 | from sotabencheval.language_modelling import WikiText103Evaluator
73 |
74 | evaluator = WikiText103Evaluator(model_name='Model name as found in paperswithcode website')
75 | ```
76 |
77 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
78 | put in the same model name string as on the
79 | [Wikitext-103](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103) leaderboard
80 | then you will enable direct comparison with the paper's model. If the arXiv ID is not available you
81 | can use the `paperswithcode.com` id (slug). Below is an example of an evaluator that matches `Transformer-XL`:
82 |
83 | ``` python
84 | from sotabencheval.language_modelling import WikiText103Evaluator
85 |
86 | evaluator = WikiText103Evaluator(
87 | model_name="Transformer-XL Large",
88 | paper_arxiv_id="1901.02860",
89 | paper_pwc_id="transformer-xl-attentive-language-models",
90 | local_root="path_to_your_data",
91 | )
92 | ```
93 |
94 | The above will directly compare with the result of the paper when run on the server.
95 |
96 | ## How Do I Evaluate Predictions?
97 |
98 | The evaluator object has an `.add(log_probs, targets)` method to submit predictions by batch or in full.
99 | We expect you to give us the log probabilities of a batch of target tokens and the `targets` tokens themselves.
100 | The `log_probs` can be either:
101 |
102 | - a 0d "tensor" (`np.ndarray`/`torch.tensor`) - summed log probability of all `targets` tokens
103 | - a 2d "tensor" (`np.ndarray`/`torch.tensor`) - log probabilities of each target token, the `log_probs.shape` should match `targets.shape`
104 | - a 3d "tensor" (`np.ndarray`/`torch.tensor`) - distribution of log probabilities for each position in the sequence, we will gather the probabilities of target tokens for you.
105 |
106 | It is recommended to use the second or third option, as it allows us to check your perplexity calculations.
107 |
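For reference, perplexity is the exponentiated average negative log-likelihood per token. If you want a rough local sanity check of the number the evaluator reports, a sketch like the following works (here `all_target_log_probs` and `num_target_tokens` are values you accumulate yourself; see the note on subword tokenization below for how normalization changes):

``` python
import numpy as np

# Rough perplexity check from accumulated target log probabilities
nll = -np.sum(all_target_log_probs)
perplexity = np.exp(nll / num_target_tokens)
```
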
108 | If your model uses subword tokenization you don't need to convert subwords to full words. You are free to report the probability of each subword: we will adjust the perplexity normalization accordingly. Just make sure to set `subword_tokenization=True` in your evaluator.
109 |
110 | Here is an example of how to report results (for a PyTorch example):
111 |
112 | ``` python
113 |
114 | evaluator = WikiText103Evaluator(
115 | model_name='GPT-2 Small',
116 | paper_pwc_id="language-models-are-unsupervised-multitask",
117 | local_root="path_to_your_data",
118 | subword_tokenization = True
119 | )
120 |
121 | # run your data preprocessing; for GPT-2 this removes Moses tokenization artifacts
122 | with torch.no_grad():
123 |     model.eval()
124 |     for input, target in data_loader:
125 |         output = model(input)
126 |         log_probs = torch.log_softmax(output, dim=-1)
127 |         target_log_probs = log_probs.gather(-1, target.unsqueeze(-1))
128 |         evaluator.add(target_log_probs, target)
129 | ```
130 |
131 | When you are done, you can get the results locally by running:
132 |
133 | ``` python
134 | evaluator.get_results()
135 | ```
136 |
137 | But for the server you want to save the results by running:
138 |
139 | ``` python
140 | evaluator.save()
141 | ```
142 |
143 | This method serialises the results and model metadata and stores them in the server database.
144 |
145 | ## How Do I Cache Evaluation?
146 |
147 | Sotabench reruns your script on every commit. This is good because it acts like
148 | continuous integration in checking for bugs and changes, but can be annoying
149 | if the model hasn't changed and evaluation is lengthy.
150 |
151 | Fortunately sotabencheval has caching logic that you can use.
152 |
153 | The idea is that after the first batch, we hash the model outputs and the
154 | current metrics; this tells us whether the model has changed, given the dataset.
155 | You can include the cache check within an evaluation loop as follows (the example
156 | below is for a PyTorch repository):
157 |
158 | ``` python
159 | with torch.no_grad():
160 | for input, target in data_loader:
161 | # ...
162 | output = model(input)
163 |         log_probs = ...  # compute log probabilities of the target tokens
164 | evaluator.add(log_probs, target)
165 |
166 | if evaluator.cache_exists:
167 | break
168 |
169 | evaluator.save()
170 | ```
171 |
172 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
173 | we simply return the cached results rather than running the whole evaluation again.
174 |
175 | Caching is very useful if you have large models, or a repository that is evaluating
176 | multiple models, as it speeds up evaluation significantly.
177 |
178 |
179 | ## A full sotabench.py example
180 |
181 | Below we show an implementation for a model from the `huggingface/transformers` repository. This
182 | incorporates all the features explained above: (a) using the server data,
183 | (b) using the WikiText-103 Evaluator, and (c) caching the evaluation logic:
184 |
185 | ``` python
186 | import torch
187 | from tqdm import tqdm
188 | from sotabencheval.language_modelling import WikiText103Evaluator
189 |
190 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
191 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
192 |
193 | evaluator = WikiText103Evaluator(
194 | model_name="Transformer-XL Large",
195 | paper_arxiv_id="1901.02860",
196 | paper_pwc_id="transformer-xl-attentive-language-models",
197 | local_root='/content/wikitext-103'
198 | )
199 |
200 | with evaluator.test_set_path.open() as f:
201 | test_data = torch.tensor(tokenizer.encode(f.read()))
202 |
203 | seq_len = 128
204 | with torch.no_grad():
205 |     evaluator.reset_time()
206 | model.eval()
207 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
208 | for s in tqdm(range(0, X.shape[-1], seq_len)):
209 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
210 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
211 | evaluator.add(log_probs, y)
212 | if evaluator.cache_exists:
213 | break
214 | evaluator.save()
215 | evaluator.print_results()
216 | ```
217 |
218 | You can run this example on [Google Colab](https://colab.research.google.com/drive/1Qcp1_Fgo_aMtSgf_PV1gFw1DT6hEv7fW).
219 |
220 | ## Need More Help?
221 |
222 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench forums if you have any questions or difficulties.
223 |
--------------------------------------------------------------------------------
/docs/docs/wmt.md:
--------------------------------------------------------------------------------
1 | # WMT
2 |
3 | You can view the WMT Machine Translation leaderboards:
4 |
5 | - [WMT2014 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
6 | - [WMT2014 English-French](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-french)
7 | - [WMT2019 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2019-english-german)
8 |
9 | ## Getting Started
10 |
11 | You'll need the following in the root of your repository:
12 |
13 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
14 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
15 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
16 |
17 | You can write whatever you want in your `sotabench.py` file to get model predictions on the WMT datasets.
18 |
19 | But you will need to record your results for the server, and you'll want to avoid doing things like
20 | downloading the dataset on the server. So you should:
21 |
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Point to the server WMT data path** - popular datasets are pre-downloaded on the server.
24 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
25 |
26 | We explain how to do these various steps below.
27 |
28 | ## How Do I Initialize an Evaluator?
29 |
30 | Before you start batching over the dataset and making predictions you need
31 | to create an evaluator instance to record results for a given leaderboard.
32 | For example, to evaluate on the WMT2014 News English-French test set, add this
33 | to your code:
34 |
35 | ``` python
36 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
37 |
38 | evaluator = WMTEvaluator(
39 | dataset=WMTDataset.News2014,
40 | source_lang=Language.English,
41 | target_lang=Language.French,
42 | local_root='mydatasets',
43 | model_name='My Super Model'
44 | )
45 | ```
46 |
47 | You can use `evaluator.source_dataset_path: Path` and `evaluator.target_dataset_path: Path`
48 | to get paths to the source and target SGML files.
49 | In the example above the first one resolves to `.data/nlp/wmt/newstest2014-fren-src.en.sgm` on
50 | sotabench server and `mydatasets/newstest2014-fren-src.en.sgm` when run locally.
51 | If you want to use non-standard file names locally you can override the defaults like this:
52 |
53 | ``` python
54 | evaluator = WMTEvaluator(
55 | ...,
56 |     local_root='mydatasets',
57 | source_dataset_filename='english.sgm',
58 | target_dataset_filename='french.sgm'
59 | )
60 | ```
61 |
62 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
63 | put in the same model name string as on the leaderboard
64 | then you will enable direct comparison with the paper's model. For example:
65 |
66 | ``` python
67 | evaluator = WMTEvaluator(
68 | dataset=WMTDataset.News2019,
69 | source_lang=Language.English,
70 | target_lang=Language.German,
71 | local_root="mydatasets",
72 | model_name="Facebook-FAIR (single)",
73 | paper_arxiv_id="1907.06616"
74 | )
75 | ```
76 |
77 | The above will directly compare with the result of the paper when run on the server.
78 |
79 | By default the evaluator computes a detokenized mixed-case SacreBLEU score.
80 | To get a tokenized BLEU score as well, during construction of the evaluator set
81 | a `tokenization: Callable[[str], str]` parameter to a function that tokenizes
82 | an input segment and returns the segment with tokens separated by spaces, e.g.:
83 |
84 | ``` python
85 | def get_tokenization():
86 | mt = sacremoses.MosesTokenizer()
87 | def tokenize(sentence):
88 | return mt.tokenize(sentence, return_str=True)
89 | return tokenize
90 |
91 | evaluator = WMTEvaluator(
92 | ...,
93 | tokenization=get_tokenization()
94 | )
95 | ```
96 |
97 | Instead of parsing the dataset files yourself, you can access the raw segments as strings:
98 |
99 | ``` python
100 | for segment_id, text in evaluator.source_segments.items():
101 | # translate text
102 |
103 | # or get segments within document context
104 | for document in evaluator.source_documents:
105 | context = [segment.text for segment in document.segments]
106 | for segment in document.segments:
107 | segment_id, text = segment.id, segment.text
108 | # translate text in context
109 | ```
110 |
111 | ## How Do I Evaluate Predictions?
112 |
113 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full.
114 |
115 | For WMT the expected input is a dictionary, where keys are source segment
116 | ids and values are translated segments
117 | (a segment id is created by concatenating the document id and the original segment id,
118 | separated by `#`). For example:
119 |
120 | ``` python
121 | evaluator.add({
122 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"',
123 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...',
124 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...'
125 | })
126 | ```
127 |
128 | You can do this all at once in a single call to `add()`, but more naturally, you will
129 | probably loop over the dataset and call the method for the outputs of each batch.
130 | That would look something like this (for a PyTorch example):
131 |
132 | ``` python
133 | with torch.no_grad():
134 | for i, (input, target) in enumerate(data_loader):
135 | ...
136 | output = model(input)
137 | # potentially formatting of the output here to be a dict
138 | evaluator.add(output)
139 | ```
140 |
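As an illustration of that formatting step (a sketch only; `translate_batch` is a placeholder for your own model, not a library function), you can batch over `evaluator.source_segments` and submit translations keyed by segment id:

``` python
# Translate source segments in small batches and submit each batch keyed by segment id
batch_size = 32
batch_ids, batch_texts = [], []
for sid, text in evaluator.source_segments.items():
    batch_ids.append(sid)
    batch_texts.append(text)
    if len(batch_texts) == batch_size:
        evaluator.add(dict(zip(batch_ids, translate_batch(batch_texts))))
        batch_ids, batch_texts = [], []
if batch_texts:  # flush the final partial batch
    evaluator.add(dict(zip(batch_ids, translate_batch(batch_texts))))
```
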
141 | When you are done, you can get the results locally by running:
142 |
143 | ``` python
144 | evaluator.get_results()
145 | ```
146 |
147 | But for the server you want to save the results by running:
148 |
149 | ``` python
150 | evaluator.save()
151 | ```
152 |
153 | This method serialises the results and model metadata and stores them in the server database.
154 |
155 | ## How Do I Cache Evaluation?
156 |
157 | Sotabench reruns your script on every commit. This is good because it acts like
158 | continuous integration in checking for bugs and changes, but can be annoying
159 | if the model hasn't changed and evaluation is lengthy.
160 |
161 | Fortunately sotabencheval has caching logic that you can use.
162 |
163 | The idea is that after the first batch, we hash the model outputs and the
164 | current metrics; this tells us whether the model has changed, given the dataset.
165 | You can include the cache check within an evaluation loop as follows (the example
166 | below is for a PyTorch repository):
167 |
168 | ``` python
169 | with torch.no_grad():
170 | for i, (input, target) in enumerate(data_loader):
171 | ...
172 | output = model(input)
173 | # potentially formatting of the output here to be a list of dicts
174 | evaluator.add(output)
175 |
176 | if evaluator.cache_exists:
177 | break
178 |
179 | evaluator.save()
180 | ```
181 |
182 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
183 | we simply return the cached results rather than running the whole evaluation again.
184 |
185 | Caching is very useful if you have large models, or a repository that is evaluating
186 | multiple models, as it speeds up evaluation significantly.
187 |
188 | ## A Full sotabench.py Example
189 |
190 | Below we show an implementation for a model loaded via `torch.hub` from `pytorch/fairseq`. This
191 | incorporates all the features explained above: (a) using the WMT Evaluator,
192 | (b) accessing segments from evaluator, and (c) the evaluation caching logic.
193 | For clarity we omit batching and simply translate segment by segment.
194 |
195 | ``` python
196 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
197 | from tqdm import tqdm
198 | import torch
199 |
200 | evaluator = WMTEvaluator(
201 | dataset=WMTDataset.News2019,
202 | source_lang=Language.English,
203 | target_lang=Language.German,
204 | local_root="data/nlp/wmt",
205 | model_name="Facebook-FAIR (single)",
206 | paper_arxiv_id="1907.06616"
207 | )
208 |
209 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model',
210 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda()
211 |
212 | for sid, text in tqdm(evaluator.source_segments.items()):
213 | translated = model.translate(text)
214 | evaluator.add({sid: translated})
215 | if evaluator.cache_exists:
216 | break
217 |
218 | evaluator.save()
219 | print(evaluator.results)
220 |
221 | ```
222 |
223 | ## Need More Help?
224 |
225 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench
226 | forums if you have any questions or difficulties.
227 |
--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: sotabencheval Docs
2 | theme:
3 | name: 'material'
4 | palette:
5 | primary: 'cyan'
6 | accent: 'cyan'
7 | logo:
8 | icon: 'explore'
9 | markdown_extensions:
10 | - admonition
11 | - codehilite
--------------------------------------------------------------------------------
/docs/site/img/squad20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/site/img/squad20.png
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | black==19.3b0
3 | flake8==3.7.8
4 | mkdocs-material
5 | pre-commit==1.18.3
6 | pydocstyle==4.0.1
7 | pygments
8 | pytest==5.1.1
9 | pytest-cov==2.7.1
10 | recommonmark==0.6.0
11 | sphinx==2.2.0
12 | sphinx-rtd-theme==0.4.3
13 | twine==1.13.0
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython
2 | numpy
3 | pycocotools>=2.0.0
4 | sotabenchapi>=0.0.13
5 | tqdm>=4.32.2
6 | beautifulsoup4>=4.7.0
7 | sacrebleu==1.4.1
8 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203,E402,W503,E701
3 |
4 | [pydocstyle]
5 | ignore = D10,D202,D203,D212,D213,D401,D403,D406,D407,D413
6 |
7 | [tool:pytest]
8 | testpaths = sotabench-eval/test
9 | python_files = test_*.py
10 | norecursedirs = .git
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | from setuptools import setup, find_packages
3 | from sotabencheval.version import __version__
4 |
5 | name = "sotabencheval"
6 | author = "Atlas ML"
7 | author_email = "hello@sotabench.com"
8 | license = "Apache-2.0"
9 | url = "https://sotabench.com"
10 | description = (
11 | "Easily benchmark Machine Learning models on selected tasks and datasets"
12 | )
13 |
14 |
15 | def get_requirements():
16 | with io.open("requirements.txt") as f:
17 | return [
18 | line.strip()
19 | for line in f.readlines()
20 | if not line.strip().startswith("#")
21 | ]
22 |
23 |
24 | setup(
25 | name=name,
26 | version=__version__,
27 | author=author,
28 | author_email=author_email,
29 | maintainer=author,
30 | maintainer_email=author_email,
31 | description=description,
32 | long_description=io.open("README.md", "r", encoding="utf-8").read(),
33 | long_description_content_type="text/markdown",
34 | url=url,
35 | platforms=["Windows", "POSIX", "MacOSX"],
36 | license=license,
37 | packages=find_packages(),
38 | include_package_data=True,
39 | install_requires=get_requirements(),
40 | classifiers=[
41 | "Programming Language :: Python :: 3",
42 | "Programming Language :: Python :: 3.6",
43 | "Programming Language :: Python :: 3.7",
44 | "License :: OSI Approved :: Apache Software License",
45 | "Operating System :: OS Independent",
46 | ],
47 | )
48 |
49 |
--------------------------------------------------------------------------------
/sotabencheval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/sotabencheval/__init__.py
--------------------------------------------------------------------------------
/sotabencheval/core/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core.evaluator import BaseEvaluator
2 |
3 | __all__ = ["BaseEvaluator"]
--------------------------------------------------------------------------------
/sotabencheval/core/cache.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | CACHE_FLOAT_PRECISION = 3
4 |
5 |
6 | def cache_value(value):
7 | """
8 | Takes in a value and puts it in a format ready for hashing + caching
9 |
10 | Why? In sotabench we hash the output after the first batch as an indication of whether the model has changed or not.
11 | If the model hasn't changed, then we don't run the whole evaluation on the server - but return the same results
12 | as before. This speeds up evaluation - making "continuous evaluation" more feasible...it also means lower
13 | GPU costs for us :).
14 |
15 | We apply some rounding and reformatting so small low precision changes do not change the hash.
16 |
17 | :param value: example model output
18 | :return: formatted value (rounded and ready for hashing)
19 | """
20 | if isinstance(value, (str, int, bool)) or value is None:
21 | return value
22 | elif isinstance(value, float):
23 | return np.round(value, CACHE_FLOAT_PRECISION)
24 | elif isinstance(value, dict):
25 | return {key: cache_value(val) for key, val in sorted(value.items(), key=lambda x: x[0])}
26 | elif isinstance(value, list):
27 | return [cache_value(val) for val in value]
28 | elif isinstance(value, np.ndarray):
29 | return value.round(CACHE_FLOAT_PRECISION)
30 |
--------------------------------------------------------------------------------
/sotabencheval/core/evaluator.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from sotabenchapi.client import Client
4 | from sotabenchapi.core import BenchmarkResult
5 | from sotabencheval.utils import is_server
6 | from sotabencheval.core.cache import cache_value
7 |
8 |
9 | class BaseEvaluator:
10 | """Base class for evaluator objects on tasks
11 |
12 | Currently SQuAD and WMT use this as a parent.
13 |
14 | TODO: Refactor ImageNet, COCO, ADE20K, PASCAL to utilise this class
15 |
16 | The core API design relies upon:
17 |
18 | (a) Initializing an Evaluator object and linking to a paper, for example:
19 |
20 | .. code-block:: python
21 |
22 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
23 |
24 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
25 | version=SQuADVersion.V20)
26 |
27 | The paper metadata allows the results to be linked to paper results when submitted to sotabench.com.
28 |
29 | (b) Adding Predictions (usually in batch) - example below for PyTorch iterating over DataLoader:
30 |
31 | .. code-block:: python
32 |
33 | for i, (input, target) in enumerate(data_loader):
34 | ...
35 | output = model(input)
36 | # potentially formatting of the output here
37 | evaluator.add(output)
38 |
39 | These results are accumulated and then evaluated - i.e. metrics are calculated once done.
40 |
41 | (c) Saving Results
42 |
43 | .. code-block:: python
44 | evaluator.save()
45 |
46 |     Gets the evaluation results for the current predictions added to the Evaluation object - calculates metrics -
47 |     and then, if run on the server, serializes results to a sotabench_results.json file, which is processed so the
48 |     results are stored on the server.
49 |
50 | These three steps: initialization -> adding predictions -> saving and evaluating results are the core API.
51 | They should be capable of integration with any existing evaluation logic in your repository.
52 | """
53 |
54 | def __init__(self,
55 | model_name: str = None,
56 | paper_arxiv_id: str = None,
57 | paper_pwc_id: str = None,
58 | paper_results: dict = None,
59 | model_description=None,):
60 | """
61 | Initializes a BaseEvaluator like object
62 |
63 | :param model_name: (str) The name of the model, for example 'ResNet-101', which will be saved to sotabench.com
64 | :param paper_arxiv_id: (str, optional) The paper that the model is linked to, e.g. '1906.06423'
65 | :param paper_pwc_id: (str, optional) The PWC paper id (slug), e.g. 'albert-a-lite-bert-for-self-supervised'
66 | :param paper_results: (dict, optional) If the paper you are linking to does not have results on sotabench,
67 | then you can add paper results here. This will be a dictionary with keys as metric names, and values as metric
68 | values. This will be benchmark specific.
69 | :param model_description: (str, optional) Optional description for the model; this can contain details about
70 | where the weights are from, details about training, and more. This will appear in an info box for the model
71 | when it is displayed on sotabench.com.
72 | """
73 |
74 | # Model metadata
75 |
76 | self.model_name = model_name
77 | self.paper_arxiv_id = paper_arxiv_id
78 | self.paper_pwc_id = paper_pwc_id
79 | self.paper_results = paper_results
80 | self.model_description = model_description
81 |
82 | # Backend variables for hashing and caching
83 |
84 | self.first_batch_processed = False
85 | self.batch_hash = None
86 | self.cached_results = False
87 | self.results = None
88 | self._cache_exists = None
89 |
90 | # Speed and memory metrics
91 |
92 | self.init_time = time.time()
93 | self.speed_mem_metrics = {}
94 |
95 | @property
96 | def cache_exists(self):
97 | """
98 | Checks whether the cache exists in the sotabench.com database - if so
99 | then sets self.results to cached results and returns True.
100 |
101 | You can use this property for control flow to break a for loop over a dataset
102 | after the first iteration. This prevents re-running the same calculation for the
103 | same model twice.
104 |
105 | Q: Why should the user use this?
106 |         A: If you want fast "continuous evaluation" and want to avoid rerunning the same model over and over
107 |         each time you commit something new to your repository.
108 |
109 | Examples:
110 | Breaking a for loop if the model is the same as last time we ran
111 |
112 | .. code-block:: python
113 |
114 | ...
115 |
116 | with torch.no_grad():
117 | for i, (input, target) in enumerate(iterator):
118 | ...
119 | output = model(input)
120 | # optional formatting of output here to be a list of detection dicts
121 | evaluator.add(output)
122 |
123 | if evaluator.cache_exists:
124 | break
125 |
126 | evaluator.save()
127 |
128 | This logic is for the server; it will not break the loop if you evaluate locally.
129 |
130 | :return: bool or None (if not on server)
131 | """
132 |
133 | if not is_server(): # we only check the cache on the server
134 | return None
135 |
136 | if not self.first_batch_processed:
137 | return False
138 |
139 | if self._cache_exists is not None:
140 | return self._cache_exists
141 |
142 | client = Client.public()
143 | cached_res = client.get_results_by_run_hash(self.batch_hash)
144 | if cached_res:
145 | self.results = cached_res
146 | self.cached_results = True
147 | print(
148 | "No model change detected (using the first batch run "
149 | f"hash {self.batch_hash}). Will use cached results."
150 | )
151 |
152 | self._cache_exists = True
153 | else:
154 | self._cache_exists = False
155 | return self._cache_exists
156 |
157 | def reset(self):
158 |         """Resets the internal state of the evaluator so you can start over"""
159 | pass
160 |
161 | def cache_values(self, **kwargs):
162 | """
163 |         Takes in keyword arguments and converts each to a hashable (cacheable) format
164 | 
165 |         :param kwargs: keyword arguments
166 |         :return: cacheable version of the keyword arguments
167 | """
168 | return cache_value(kwargs)
169 |
170 | def eval(self, results_generator):
171 |         """Run the full evaluation loop on results_generator"""
172 | self.reset()
173 | self.reset_time()
174 | for results in results_generator:
175 | self.add(*results)
176 | if self.first_batch_processed and self.cache_exists:
177 | break
178 | self.save()
179 | return self
180 |
181 | def get_results(self):
182 | """Calculate results."""
183 | return self.results
184 |
185 | def print_results(self):
186 | """Print results."""
187 | self.get_results()
188 | print(f"results = {self.results}, speed_mem_metrics = {self.speed_mem_metrics}")
189 |
190 | def reset_time(self):
191 | """
192 | Simple method to reset the timer self.init_time. Often used before a loop, to time the evaluation
193 | appropriately, for example:
194 |
195 | .. code-block:: python
196 |
197 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
198 |
199 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
200 | version=SQuADVersion.V20)
201 |
202 | # processing/setup logic here
203 |
204 | evaluator.reset_time()
205 |
206 | for i, (input, target) in enumerate(data_loader):
207 | ...
208 | output = model(input)
209 | # potentially formatting of the output here
210 | evaluator.add(output)
211 |
212 | evaluator.save()
213 |
214 |         Above we may have processing logic in between the evaluator initialization and the actual evaluation loop, so
215 |         we reset the timer so that the timing reflects only the evaluation (and not setup steps like data processing,
216 |         loading the model, etc.).
217 |
218 | :return: void - resets self.init_time
219 | """
220 | self.init_time = time.time()
221 |
222 | def save(self, **kwargs):
223 | """
224 | Calculate results and then put into a BenchmarkResult object
225 |
226 | On the sotabench.com server, this will produce a JSON file serialisation in sotabench_results.json and results
227 | will be recorded on the platform.
228 |
229 | Users should save once all predictions are added, for instance:
230 |
231 | .. code-block:: python
232 |
233 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
234 |
235 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
236 | version=SQuADVersion.V20)
237 |
238 | # processing/setup logic here
239 |
240 | evaluator.reset_time()
241 |
242 | for i, (input, target) in enumerate(data_loader):
243 | ...
244 | output = model(input)
245 | # potentially formatting of the output here
246 | evaluator.add(output)
247 |
248 | evaluator.save()
249 |
250 | Here once we have added all the predictions to the evaluator, we .save() so we evaluate and, if on the server,
251 | results are serialized and saved to the server.
252 |
253 | :return: BenchmarkResult object with results and metadata
254 | """
255 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
256 | self.get_results()
257 |
258 | return BenchmarkResult(
259 | task=self.task,
260 | config={},
261 | results=self.results,
262 | speed_mem_metrics=self.speed_mem_metrics,
263 | model=self.model_name,
264 | model_description=self.model_description,
265 | arxiv_id=self.paper_arxiv_id,
266 | pwc_id=self.paper_pwc_id,
267 | paper_results=self.paper_results,
268 | run_hash=self.batch_hash,
269 | **kwargs,
270 | )
271 |
--------------------------------------------------------------------------------
/sotabencheval/image_classification/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["ImageNetEvaluator"]
2 |
3 | from sotabencheval.image_classification.imagenet import ImageNetEvaluator
--------------------------------------------------------------------------------
/sotabencheval/image_classification/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def top_k_accuracy_score(y_true, y_pred, k=5, normalize=True):
4 | """
5 | Top k Accuracy classification score.
6 | :param y_true: the true labels (np.ndarray)
7 | :param y_pred: the predicted labels (np.ndarray)
8 | :param k: calculates top k accuracy (int)
9 | :param normalize: whether to normalize by the number of observations
10 | :return: the top k accuracy
11 | """
12 |
13 | if len(y_true.shape) == 2:
14 | y_true = y_true[0] # should be one-dimensional
15 |
16 | num_obs, num_labels = y_pred.shape
17 |
18 | idx = num_labels - k - 1
19 | counter = 0
20 | argsorted = np.argsort(y_pred, axis=1)
21 |
22 | for i in range(num_obs):
23 | if y_true[i] in argsorted[i, idx+1:]:
24 | counter += 1
25 | if normalize:
26 | return counter / num_obs
27 | else:
28 | return counter
--------------------------------------------------------------------------------
/sotabencheval/language_modelling/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.language_modelling.wikitext import WikiText103Evaluator, WikiText2Evaluator, WikiTextEvaluator, WikiTextDataset
2 |
3 | __all__ = ["WikiText103Evaluator", "WikiText2Evaluator",
4 | "WikiTextEvaluator", "WikiTextDataset"]
5 |
--------------------------------------------------------------------------------
/sotabencheval/language_modelling/wikitext.py:
--------------------------------------------------------------------------------
1 | import time
2 | from enum import Enum
3 | from pathlib import Path
4 |
5 | import numpy as np
6 |
7 | from sotabencheval.core import BaseEvaluator
8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated
9 |
10 |
11 | class WikiTextDataset(Enum):
12 | """Enum used to select the dataset on which evaluation is executed. """
13 | WikiText103 = ('WikiText-103', 245569, 267735)
14 | WikiText2 = ('WikiText-2', 245569, 33278)
15 |
16 | def __init__(self, pwc_name, testset_size, vocab_size):
17 | """
18 | Creates an enum instance
19 | :param pwc_name: the name of the dataset as it is found on paperswithcode leaderboard
20 | :param testset_size: the size of the test set in words
21 |         :param vocab_size: the size of the dataset vocabulary
22 | """
23 | self.pwc_name = pwc_name
24 | self.testset_size = testset_size
25 | self.vocab_size = vocab_size
26 |
27 | def _get_path(self, local_root, local_unzip=False):
28 | root = Path(change_root_if_server(root=local_root,
29 | server_root=".data/nlp/" + self.pwc_name.lower()))
30 | zip_name = self.pwc_name.lower() + "-v1.zip"
31 | dataset_path = root / "wiki.test.tokens"
32 | if not dataset_path.exists(): # unzip
33 | extract_archive(str(root / zip_name), to_path=root.parent)
34 | return dataset_path
35 |
36 | get_path = _get_path # deprecated API, for backward compatibility with existing benchmarks
37 |
38 | def get_test_set_path(self, local_root):
39 | """
40 | Unzips the datasets and returns path to "wiki.test.tokens"
41 | :param local_root: Path to the directory where the dataset files are located locally.
42 | Ignored when run on sotabench server.
43 | """
44 | return self.get_path(local_root).parent / "wiki.test.tokens"
45 |
46 | def get_validation_set_path(self, local_root):
47 | """
48 |         Unzips the datasets and returns path to "wiki.valid.tokens"
49 | :param local_root: Path to the directory where the dataset files are located locally.
50 | Ignored when run on sotabench server.
51 | """
52 | return self.get_path(local_root).parent / "wiki.valid.tokens"
53 |
54 | def _to_numpy(*args):
55 | def convert(a):
56 | if hasattr(a, 'cpu') and hasattr(a, 'numpy'):
57 | return a.cpu().numpy()
58 | if isinstance(a, list):
59 | return np.array(a)
60 | return a
61 | return [convert(a) for a in args]
62 |
63 | def _gather_probs(log_probs, targets):
64 | """
65 | Gather probabilities of each target token, from the model activations after log_softmax
66 | :param log_probs: - `torch.tensor`/`np.ndarray` shape [bs x seq_len x vocab_sz]
67 | with model activations after `log_softmax`, with log probability of each word in the vocab
68 | :param targets: - `torch.tensor`/`np.ndarray` shape [bs x seq_len] with ground truth words
69 | """
70 | if hasattr(log_probs, 'gather'):
71 | # if we work with torch this method is faster than numpy implementation
72 | probs = log_probs.gather(-1, targets.unsqueeze(-1))
73 | elif isinstance(log_probs, np.ndarray):
74 | # use slower numpy implementation if we have ndarrays
75 | vocab_sz = int(log_probs.shape[-1])
76 | log_probs, targets = _to_numpy(log_probs, targets)
77 | log_probs = log_probs.reshape(-1, vocab_sz)
78 | targets = targets.reshape(-1)
79 | probs = log_probs[np.arange(log_probs.shape[0]), targets]
80 | return _to_numpy(probs, targets)
81 |
82 |
83 | class WikiTextEvaluator(BaseEvaluator):
84 | task = "Language Modelling"
85 | dataset = None # defined in a subclass
86 |
87 | def __init__(self,
88 | local_root: str = '.',
89 | model_name: str = None,
90 | paper_arxiv_id: str = None,
91 | paper_pwc_id: str = None,
92 | paper_results: dict = None,
93 | model_description=None,
94 | subword_tokenization: bool = False,
95 | text_transformation: bool = False,
96 | dataset=None):
97 | """
98 | Creates an evaluator for one of the WikiText benchmarks.
99 |
100 | :param local_root: Path to the directory where the dataset files are located locally.
101 | Ignored when run on sotabench server.
102 | :param model_name: The name of the model from the
103 | paper - if you want to link your build to a model from a
104 |             machine learning paper. See the WikiText-103 benchmark page for model names
105 |             (e.g., https://sotabench.com/benchmarks/language-modelling-on-wikitext-103),
106 |             either on the paper leaderboard or on the "models yet to try" tab.
107 | :param paper_arxiv_id: Optional linking to arXiv if you
108 | want to link to papers on the leaderboard; put in the
109 | corresponding paper's arXiv ID, e.g. '1901.02860'.
110 | :param paper_pwc_id: Optional linking to Papers With Code;
111 | put in the corresponding papers with code URL slug, e.g.
112 | "transformer-xl-attentive-language-models"
113 | :param paper_results: If the paper model you are reproducing
114 | does not have model results on sotabench.com, you can specify
115 | the paper results yourself through this argument, where keys
116 | are metric names, values are metric values. e.g:
117 |
118 | {'Test perplexity': 18.2 }.
119 |
120 | Ensure that the metric names match those on the sotabench
121 | leaderboard - for WikiText benchmarks it should be `Test perplexity`.
122 | :param model_description: Optional model description.
123 |         :param subword_tokenization: Should be set to `True` if your model uses subword tokens; defaults to `False`.
124 |         :param text_transformation: Should be set to `True` if you use a detokenizer that removes Moses artefacts, e.g. in a zero-shot setting.
125 |         :param dataset: internal parameter; do not set in subclasses.
126 | """
127 | super().__init__(model_name, paper_arxiv_id,
128 | paper_pwc_id, paper_results, model_description)
129 | if dataset is not None:
130 | self.dataset = dataset
131 | self.subword_tokenization = subword_tokenization
132 | self.text_transformation = text_transformation
133 | self.local_root = local_root
134 | self._neglogloss = 0
135 | self._data_set_size = 0
136 |
137 | @property
138 | def dataset_path(self): # deprecated
139 | return self.dataset.get_path(self.local_root)
140 |
141 | @property
142 | def test_set_path(self):
143 | """Returns path to test set, uses `self.local_root` when it is not on the server"""
144 | return self.get_test_set_path(self.local_root)
145 |
146 | @classmethod
147 | def get_test_set_path(cls, local_root):
148 | """
149 | Unzips the dataset and returns the path to "wiki.test.tokens"
150 | :param local_root: Path to the directory where the dataset files are located locally.
151 | Ignored when run on sotabench server.
152 | """
153 | return cls.dataset.get_test_set_path(local_root)
154 |
155 | def reset(self):
156 | """
157 | Removes already added results
158 |
159 |
160 | When checking if the model should be rerun on the whole dataset it is first run on a smaller subset
161 | and the results are compared with values cached on the sotabench server (the check is not performed
162 | when running locally). Ideally, the smaller subset is just the first batch, so no additional
163 | computation is needed. However, for more complex multistage pipelines it may be simpler to
164 | run a model twice - on a small dataset and (if necessary) on the full dataset. In that case
165 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
166 |
167 | .. seealso:: :func:`cache_exists`
168 | .. seealso:: :func:`reset_time`
169 | """
170 | self._neglogloss = 0
171 | self._data_set_size = 0
172 |
173 | def add(self, log_probs, targets):
174 | """
175 | Updates the evaluator with new results
176 |
177 | :param log_probs: `np.ndarray` or `torch.tensor` with log probabilities of target tokens; can be either:
178 | - a 0d tensor
179 | summed log probability of all `targets` tokens, or
180 | - a 2d tensor [bs x seq_len]
181 | log probabilities of each target token; the shapes of `log_probs` and `targets` must match.
182 | - a 3d tensor [bs x seq_len x vocab_size]
183 | distribution of log probabilities for each position in the sequence,
184 | we will gather the probabilities of target tokens for you.
185 | :param targets: a `np.ndarray` or `torch.tensor` with ids of ground truth tokens.
186 |
187 | Examples:
188 | Update the evaluator with a result for a sentence with 10 tokens:
189 |
190 | .. code-block:: python
191 | targets = np.array([[ 32, 582, 2731, 19, 1, 786, 5, 98693, 55362, 5 ]])
192 | log_probs = np.array([[ -9.8461, -9.3343, -17.8042, -11.2006, -22.3345, -14.4665, -2.0055,
193 | -14.2044, -14.7545, -5.7888]])
194 | my_evaluator.add(log_probs, targets)
195 | """
196 | if isinstance(log_probs, float):
197 | log_probs = np.array([log_probs]) # for sum to work
198 | elif log_probs.shape[:-1] == targets.shape:
199 | log_probs, targets = _gather_probs(log_probs, targets)
200 | else:
201 | assert log_probs.shape == targets.shape, f"log_probs have to be either gathered log probabilities of targets or the full probability distribution, received {log_probs.shape} {repr(log_probs)}"
202 | self._neglogloss += - float(log_probs.sum())
203 | self._data_set_size += int(np.prod(list(targets.shape)))
204 |
205 | if not self.first_batch_processed:
206 | content = self.cache_values(
207 | probs=_to_numpy(log_probs)[0].reshape(-1),
208 | api_version=3)
209 | self.batch_hash = calculate_batch_hash(content)
210 | self.first_batch_processed = True
211 | return self.results
212 |
213 | def print_results(self):
214 | """ Calculates and print results. """
215 | super().print_results()
216 | print("Perplexity:", np.exp(self._neglogloss / self.dataset.testset_size),
217 | "NeglogLoss:", self._neglogloss, "Tokens Count:", self._data_set_size)
218 |
219 | print_stats = print_results
220 |
221 | def get_results(self):
222 | """
223 | Calculates the perplexity and measures the performance of the model
224 |
225 | :return: dict with `Test perplexity`
226 | """
227 | if self.cached_results:
228 | return self.results
229 | perplexity = np.exp(self._neglogloss /
230 | self.dataset.testset_size)
231 |
232 | self.results = {
233 | 'Test perplexity': perplexity
234 | }
235 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
236 | exec_speed = (time.time() - self.init_time)
237 | count = self.dataset.testset_size
238 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed
239 | self.speed_mem_metrics['Tasks'] = count
240 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
241 | return self.results
242 |
243 | def save(self):
244 | """Save results to the server databese/"""
245 | return super().save(dataset=self.dataset.pwc_name)
246 |
247 |
248 | class WikiText103Evaluator(WikiTextEvaluator):
249 | """`WikiText103 `_ benchmark.
250 |
251 | Examples:
252 | Evaluate a language model from the transformers repository:
253 |
254 | .. code-block:: python
255 |
256 | import torch
257 | from tqdm import tqdm
258 | from sotabencheval.language_modelling import WikiText103Evaluator
259 |
260 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
261 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
262 |
263 | evaluator = WikiText103Evaluator(
264 | model_name="Transformer-XL Large",
265 | paper_arxiv_id="1901.02860",
266 | paper_pwc_id="transformer-xl-attentive-language-models",
267 | local_root='/content/wikitext-103'
268 | )
269 |
270 | with evaluator.test_set_path.open() as f:
271 | test_data = torch.tensor(tokenizer.encode(f.read()))
272 |
273 | seq_len = 128
274 | with torch.no_grad():
275 | evaluator.reset_timer()
276 | model.eval()
277 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
278 | for s in tqdm(range(0, X.shape[-1], seq_len)):
279 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
280 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
281 | evaluator.add(log_probs, y)
282 | if evaluator.cache_exists:
283 | break
284 | evaluator.save()
285 | evaluator.print_results()
286 | """
287 | dataset = WikiTextDataset.WikiText103
288 |
289 |
290 | class WikiText2Evaluator(WikiTextEvaluator):
291 | """`WikiText103 `_ benchmark.
292 |
293 | Examples:
294 | Evaluate a language model from the transformers repository:
295 |
296 | .. code-block:: python
297 |
298 | import torch
299 | from tqdm import tqdm
300 | from sotabencheval.language_modelling import WikiText2Evaluator
301 |
302 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
303 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
304 |
305 | evaluator = WikiText2Evaluator(
306 | model_name="Transformer-XL Large",
307 | paper_arxiv_id="1901.02860",
308 | paper_pwc_id="transformer-xl-attentive-language-models",
309 | local_root='/content/wikitext-2'
310 | )
311 |
312 | with evaluator.test_set_path.open() as f:
313 | test_data = torch.tensor(tokenizer.encode(f.read()))
314 |
315 | seq_len = 128
316 | with torch.no_grad():
317 | evaluator.reset_timer()
318 | model.eval()
319 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
320 | for s in tqdm(range(0, X.shape[-1], seq_len)):
321 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
322 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
323 | evaluator.add(log_probs, y)
324 | if evaluator.cache_exists:
325 | break
326 | evaluator.save()
327 | evaluator.print_results()
328 | """
329 | dataset = WikiTextDataset.WikiText2
330 |
--------------------------------------------------------------------------------
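As a quick illustration of the three input shapes accepted by `WikiTextEvaluator.add` and of how the reported perplexity follows from the accumulated negative log-likelihood, here is a minimal, self-contained sketch with made-up numbers (no real model involved; the real evaluator divides by `dataset.testset_size` rather than by the toy token count used below):

    import numpy as np

    # Toy setup: one sequence of 4 target tokens over a 5-word vocabulary.
    targets = np.array([[2, 0, 3, 1]])                    # [bs x seq_len] ground-truth token ids
    vocab_log_probs = np.log(np.full((1, 4, 5), 0.2))     # [bs x seq_len x vocab_sz] log_softmax output

    # Option 1: pass the full distribution and let the evaluator gather target probabilities:
    #     evaluator.add(vocab_log_probs, targets)
    # Option 2: gather the per-token log probabilities yourself ([bs x seq_len]):
    gathered = np.take_along_axis(vocab_log_probs, targets[..., None], axis=-1).squeeze(-1)
    #     evaluator.add(gathered, targets)
    # Option 3: pass a single float with the summed log probability of all targets:
    #     evaluator.add(float(gathered.sum()), targets)

    # The evaluator accumulates -sum(log p) and reports exp(neglogloss / testset_size).
    neg_log_likelihood = -float(gathered.sum())
    print(np.exp(neg_log_likelihood / targets.size))      # ~5.0 for this uniform toy distribution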
/sotabencheval/machine_translation/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.machine_translation.wmt import WMTEvaluator, WMTDataset
2 | from sotabencheval.machine_translation.metrics import TranslationMetrics
3 | from sotabencheval.machine_translation.languages import Language
4 |
5 | __all__ = ["WMTDataset", "WMTEvaluator", "TranslationMetrics", "Language"]
6 |
--------------------------------------------------------------------------------
/sotabencheval/machine_translation/languages.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | _full_forms = {
4 | "en": "English",
5 | "fr": "French",
6 | "de": "German",
7 | }
8 |
9 |
10 | class Language(Enum):
11 | English = "en"
12 | French = "fr"
13 | German = "de"
14 |
15 | @property
16 | def fullname(self):
17 | return _full_forms[self.value]
18 |
--------------------------------------------------------------------------------
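For reference, the enum members can be looked up either by name or by language code, and `fullname` resolves the human-readable form defined in `_full_forms`:

    from sotabencheval.machine_translation import Language

    assert Language("de") is Language.German     # construct from a language code
    assert Language.German.value == "de"
    assert Language.German.fullname == "German"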
/sotabencheval/machine_translation/metrics.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from bs4 import BeautifulSoup
3 | from pathlib import Path
4 | from typing import Dict, List, Callable
5 | from collections import OrderedDict
6 | from sacrebleu import corpus_bleu
7 |
8 |
9 | MIN_CACHE_BATCH_SIZE = 32
10 |
11 |
12 | class TranslationMetrics:
13 | def __init__(self,
14 | source_dataset_path: Path,
15 | target_dataset_path: Path,
16 | tokenization: Callable[[str], str] = None):
17 | self._src_dataset_path = source_dataset_path
18 | self._dst_dataset_path = target_dataset_path
19 | self.answers = {}
20 | self.source_documents, self.source_segments = self._load_dataset(self._src_dataset_path)
21 | self._target_documents, self._target_segments = self._load_dataset(self._dst_dataset_path)
22 | self._tokenization = tokenization
23 | self._results = None
24 |
25 | def _load_dataset(self, dataset_path):
26 | documents = read_sgm_file(dataset_path)
27 | segments = OrderedDict([(segment.id, segment.text) for doc in documents for segment in doc.segments])
28 | return documents, segments
29 |
30 | def add(self, answers: Dict[str, str]):
31 | if not answers:
32 | print("Empty batch added to results")
33 | return
34 | if set(self.answers.keys()) & set(answers.keys()):
35 | print("Multiple translations for the same segment")
36 | self.answers.update(answers)
37 |
38 | def reset(self):
39 | self._results = None
40 | self.answers = {}
41 |
42 | def evaluate(self, ignore_missing=False):
43 | if ignore_missing:
44 | keep = set(self.answers.keys())
45 | target_segments = {sid: text for sid, text in self._target_segments.items() if sid in keep}
46 | else:
47 | target_segments = self._target_segments
48 | answers = [self.answers.get(sid, "") for sid in target_segments]
49 | references = [target for target in target_segments.values()]
50 | bleu = corpus_bleu(answers, [references])
51 | self._results = {'SacreBLEU': bleu.score}
52 |
53 | if self._tokenization is not None:
54 | tok_answers = [self._tokenization(answer) for answer in answers]
55 | tok_references = [self._tokenization(target) for target in references]
56 | tok_bleu = corpus_bleu(tok_answers, [tok_references], tokenize='none', force=True)
57 | self._results['BLEU score'] = tok_bleu.score
58 |
59 | @property
60 | def has_data(self):
61 | return len(self.answers) >= MIN_CACHE_BATCH_SIZE
62 |
63 | def get_results(self, ignore_missing=False):
64 | self.evaluate(ignore_missing)
65 | return self._results
66 |
67 |
68 | @dataclass
69 | class Segment:
70 | id: str
71 | text: str
72 |
73 |
74 | @dataclass
75 | class Document:
76 | id: str
77 | segments: List[Segment]
78 |
79 |
80 | def read_sgm_file(path):
81 | with open(path, 'rb') as f:
82 | soup = BeautifulSoup(f.read(), features="html.parser")
83 |
84 | return [
85 | Document(
86 | id=doc['docid'],
87 | segments=[
88 | Segment(
89 | id=doc['docid'] + '#' + seg['id'],
90 | text=seg.text
91 | ) for seg in doc.find_all('seg')
92 | ]
93 | ) for doc in soup.find_all('doc')
94 | ]
95 |
--------------------------------------------------------------------------------
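To make the `docid#segid` keying and the SacreBLEU call above concrete, here is a small self-contained sketch; the SGML snippet and file names are invented for illustration, and scoring a "translation" against an identical reference should yield a SacreBLEU of 100.0:

    from pathlib import Path
    from sotabencheval.machine_translation.metrics import TranslationMetrics, read_sgm_file

    sgm = '''<doc docid="bbc.381790">
    <seg id="1">Waliser AMs sorgen sich um Aussehen wie Muppets</seg>
    <seg id="2">Unter einigen AMs herrscht Besorgnis ueber den Vorschlag</seg>
    </doc>'''

    src, ref = Path("toy-src.sgm"), Path("toy-ref.sgm")
    src.write_text(sgm)
    ref.write_text(sgm)            # reuse the snippet as the reference to keep the example self-contained

    docs = read_sgm_file(src)
    print(docs[0].id)              # bbc.381790
    print(docs[0].segments[0].id)  # bbc.381790#1  (document id + '#' + original segment id)

    metrics = TranslationMetrics(src, ref)
    metrics.add({seg.id: seg.text for doc in docs for seg in doc.segments})
    print(metrics.get_results(ignore_missing=True))   # should print {'SacreBLEU': 100.0}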
/sotabencheval/machine_translation/wmt.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core import BaseEvaluator
2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server
3 | from sotabencheval.machine_translation.languages import Language
4 | from sotabencheval.machine_translation.metrics import TranslationMetrics
5 | from sotabencheval.utils import get_max_memory_allocated
6 | from typing import Dict, Callable
7 | from pathlib import Path
8 | from enum import Enum
9 | import time
10 |
11 |
12 | class WMTDataset(Enum):
13 | News2014 = "newstest2014"
14 | News2019 = "newstest2019"
15 |
16 |
17 | class WMTEvaluator(BaseEvaluator):
18 | """Evaluator for WMT Machine Translation benchmarks.
19 |
20 | Examples:
21 | Evaluate a Transformer model from the fairseq repository on WMT2019 news test set:
22 |
23 | .. code-block:: python
24 |
25 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
26 | from tqdm import tqdm
27 | import torch
28 |
29 | evaluator = WMTEvaluator(
30 | dataset=WMTDataset.News2019,
31 | source_lang=Language.English,
32 | target_lang=Language.German,
33 | local_root="data/nlp/wmt",
34 | model_name="Facebook-FAIR (single)",
35 | paper_arxiv_id="1907.06616"
36 | )
37 |
38 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model',
39 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda()
40 |
41 | for sid, text in tqdm(evaluator.source_segments.items()):
42 | translated = model.translate(text)
43 | evaluator.add({sid: translated})
44 | if evaluator.cache_exists:
45 | break
46 |
47 | evaluator.save()
48 | print(evaluator.results)
49 | """
50 |
51 | task = "Machine Translation"
52 |
53 | _datasets = {
54 | (WMTDataset.News2014, Language.English, Language.German),
55 | (WMTDataset.News2019, Language.English, Language.German),
56 | (WMTDataset.News2014, Language.English, Language.French),
57 | }
58 |
59 | def __init__(self,
60 | dataset: WMTDataset,
61 | source_lang: Language,
62 | target_lang: Language,
63 | local_root: str = '.',
64 | source_dataset_filename: str = None,
65 | target_dataset_filename: str = None,
66 | model_name: str = None,
67 | paper_arxiv_id: str = None,
68 | paper_pwc_id: str = None,
69 | paper_results: dict = None,
70 | model_description: str = None,
71 | tokenization: Callable[[str], str] = None):
72 | """
73 | Creates an evaluator for one of the WMT benchmarks.
74 |
75 | :param dataset: Which dataset to evaluate on, e.g., WMTDataset.News2014.
76 | :param source_lang: Source language of the documents to translate.
77 | :param target_lang: Target language into which the documents are translated.
78 | :param local_root: Path to the directory where the dataset files are located locally.
79 | Ignored when run on sotabench server.
80 | :param source_dataset_filename: Local filename of the SGML file with the source documents.
81 | If None, the standard WMT filename is used, based on :param:`dataset`,
82 | :param:`source_lang` and :param:`target_lang`.
83 | Ignored when run on sotabench server.
84 | :param target_dataset_filename: Local filename of the SGML file with the reference documents.
85 | If None, the standard WMT filename is used, based on :param:`dataset`,
86 | :param:`source_lang` and :param:`target_lang`.
87 | Ignored when run on sotabench server.
88 | :param model_name: The name of the model from the
89 | paper - if you want to link your build to a model from a
90 | machine learning paper. See the WMT benchmark pages
91 | (e.g., https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
92 | for model names, listed on the paper leaderboard and "models yet to try" tabs.
93 | :param paper_arxiv_id: Optional linking to arXiv if you
94 | want to link to papers on the leaderboard; put in the
95 | corresponding paper's arXiv ID, e.g. '1907.06616'.
96 | :param paper_pwc_id: Optional linking to Papers With Code;
97 | put in the corresponding papers with code URL slug, e.g.
98 | 'facebook-fairs-wmt19-news-translation-task'
99 | :param paper_results: If the paper model you are reproducing
100 | does not have model results on sotabench.com, you can specify
101 | the paper results yourself through this argument, where keys
102 | are metric names, values are metric values. e.g:
103 |
104 | {'SacreBLEU': 42.7, 'BLEU score': 43.1}.
105 |
106 | Ensure that the metric names match those on the sotabench
107 | leaderboard - for WMT benchmarks it should be `SacreBLEU` for de-tokenized
108 | case sensitive BLEU score and `BLEU score` for tokenized BLEU.
109 | :param model_description: Optional model description.
110 | :param tokenization: An optional tokenization function to compute tokenized BLEU score.
111 | It takes a single string - a segment to tokenize, and returns a string with tokens
112 | separated by spaces, e.g.:
113 |
114 | tokenization = lambda seg: seg.replace("'s", " 's").replace("-", " - ")
115 |
116 | If None, only de-tokenized SacreBLEU score is reported.
117 | """
118 |
119 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
120 | self.root = change_root_if_server(root=local_root,
121 | server_root=".data/nlp/wmt")
122 | self.dataset = dataset
123 | self.source_lang = source_lang
124 | self.target_lang = target_lang
125 |
126 | default_src_fn, default_dst_fn = self._get_source_dataset_filename()
127 | if source_dataset_filename is None or is_server():
128 | source_dataset_filename = default_src_fn
129 |
130 | if target_dataset_filename is None or is_server():
131 | target_dataset_filename = default_dst_fn
132 |
133 | self.source_dataset_path = Path(self.root) / source_dataset_filename
134 | self.target_dataset_path = Path(self.root) / target_dataset_filename
135 |
136 | self.metrics = TranslationMetrics(self.source_dataset_path, self.target_dataset_path, tokenization)
137 |
138 | def _get_source_dataset_filename(self):
139 | if self.dataset == WMTDataset.News2014:
140 | other_lang = self.source_lang.value if self.target_lang == Language.English else self.target_lang.value
141 | source = "{0}-{1}en-src.{2}.sgm".format(self.dataset.value, other_lang, self.source_lang.value)
142 | target = "{0}-{1}en-ref.{2}.sgm".format(self.dataset.value, other_lang, self.target_lang.value)
143 | elif self.dataset == WMTDataset.News2019:
144 | source = "{0}-{1}{2}-src.{1}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
145 | target = "{0}-{1}{2}-ref.{2}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
146 | else:
147 | raise ValueError("Unknown dataset: {}".format(self.dataset))
148 | return source, target
149 |
150 | def _get_dataset_name(self):
151 | cfg = (self.dataset, self.source_lang, self.target_lang)
152 | if cfg not in WMTEvaluator._datasets:
153 | raise ValueError("Unsupported dataset configuration: {} {} {}".format(
154 | self.dataset.name,
155 | self.source_lang.name,
156 | self.target_lang.name
157 | ))
158 |
159 | ds_names = {WMTDataset.News2014: "WMT2014", WMTDataset.News2019: "WMT2019"}
160 | return "{0} {1}-{2}".format(ds_names.get(self.dataset), self.source_lang.fullname, self.target_lang.fullname)
161 |
162 | def add(self, answers: Dict[str, str]):
163 | """
164 | Updates the evaluator with new results
165 |
166 | :param answers: a dict where keys are source segments ids and values are translated segments
167 | (segment id is created by concatenating document id and the original segment id,
168 | separated by `#`.)
169 |
170 | Examples:
171 | Update the evaluator with three results:
172 |
173 | .. code-block:: python
174 |
175 | my_evaluator.add({
176 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"',
177 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...',
178 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...'
179 | })
180 |
181 | .. seealso:: `source_segments`
182 | """
183 |
184 | self.metrics.add(answers)
185 |
186 | if not self.first_batch_processed and self.metrics.has_data:
187 | self.batch_hash = calculate_batch_hash(
188 | self.cache_values(answers=self.metrics.answers,
189 | metrics=self.metrics.get_results(ignore_missing=True))
190 | )
191 | self.first_batch_processed = True
192 |
193 | @property
194 | def source_segments(self):
195 | """
196 | Ordered dictionary of all segments to translate, keyed by segment id. The same segment ids
197 | have to be used when submitting translations with :func:`add`.
198 |
199 | Examples:
200 |
201 | .. code-block:: python
202 |
203 | for segment_id, text in my_evaluator.source_segments.items():
204 | translated = model(text)
205 | my_evaluator.add({segment_id: translated})
206 |
207 | .. seealso: `source_documents`
208 | """
209 |
210 | return self.metrics.source_segments
211 |
212 | @property
213 | def source_documents(self):
214 | """
215 | List of all documents to translate
216 |
217 | Examples:
218 |
219 | .. code-block:: python
220 |
221 | for document in my_evaluator.source_documents:
222 | for segment in document.segments:
223 | translated = model(segment.text)
224 | my_evaluator.add({segment.id: translated})
225 |
226 | .. seealso: `source_segments`
227 | """
228 |
229 | return self.metrics.source_documents
230 |
231 | def reset(self):
232 | """
233 | Removes already added translations
234 |
235 | When checking if the model should be rerun on the whole dataset it is first run on a smaller subset
236 | and the results are compared with values cached on the sotabench server (the check is not performed
237 | when running locally). Ideally, the smaller subset is just the first batch, so no additional
238 | computation is needed. However, for more complex multistage pipelines it may be simpler to
239 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case
240 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
241 |
242 | .. seealso:: :func:`cache_exists`
243 | .. seealso:: :func:`reset_time`
244 | """
245 |
246 | self.metrics.reset()
247 |
248 | def get_results(self):
249 | """
250 | Gets the results for the evaluator. An empty string is assumed for segments for which no translation
251 | was provided.
252 |
253 | :return: dict with `SacreBLEU` and `BLEU score`.
254 | """
255 |
256 | if self.cached_results:
257 | return self.results
258 | self.results = self.metrics.get_results()
259 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
260 |
261 | return self.results
262 |
263 | def save(self):
264 | dataset = self._get_dataset_name()
265 |
266 | if not self.cached_results:
267 | exec_speed = (time.time() - self.init_time)
268 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed
269 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers)
270 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
271 | else:
272 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
273 | self.speed_mem_metrics['Tasks'] = None
274 | self.speed_mem_metrics['Evaluation Time'] = None
275 |
276 | return super().save(dataset=dataset)
277 |
278 |
--------------------------------------------------------------------------------
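For reference, the default SGML file names derived by `_get_source_dataset_filename` follow the standard WMT naming scheme; reproducing its format strings for the two supported dataset years gives:

    # News2019 branch (e.g. English -> German):
    ds, src, tgt = "newstest2019", "en", "de"
    print("{0}-{1}{2}-src.{1}.sgm".format(ds, src, tgt))    # newstest2019-ende-src.en.sgm
    print("{0}-{1}{2}-ref.{2}.sgm".format(ds, src, tgt))    # newstest2019-ende-ref.de.sgm

    # News2014 branch (e.g. English -> French; the non-English language is the "other" language):
    ds, other, src, tgt = "newstest2014", "fr", "en", "fr"
    print("{0}-{1}en-src.{2}.sgm".format(ds, other, src))   # newstest2014-fren-src.en.sgm
    print("{0}-{1}en-ref.{2}.sgm".format(ds, other, tgt))   # newstest2014-fren-ref.fr.sgm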
/sotabencheval/natural_language_inference/__init__.py:
--------------------------------------------------------------------------------
1 | from .multinli import MultiNLI
2 |
3 | __all__ = ["MultiNLI"]
4 |
--------------------------------------------------------------------------------
/sotabencheval/natural_language_inference/multinli.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import time
3 |
4 | from itertools import zip_longest
5 | from pathlib import Path
6 |
7 | from sotabencheval.core import BaseEvaluator
8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated
9 |
10 |
11 | def read_csv(path):
12 | with path.open('r') as f:
13 | yield from csv.DictReader(f, delimiter='\t')
14 |
15 |
16 | def get_path(local_root, local_unzip=False):
17 | root = Path(change_root_if_server(root=local_root,
18 | server_root=".data/nlp/multinli"))
19 | zip_name = "MNLI.zip"
20 | dataset_path=root / "MNLI" / "dev_matched.tsv"
21 | if not dataset_path.exists(): # unzip
22 | extract_archive(str(root / zip_name), to_path=root)
23 | return (dataset_path, dataset_path.parent / "dev_mismatched.tsv")
24 |
25 |
26 | class ClassificationEvaluator:
27 | def __init__(self, file_path):
28 | self.dataset_path = file_path
29 | dataset = list(read_csv(file_path))
30 | self.targets = {d['pairID']: d['gold_label'] for d in dataset}
31 | self.dataset = {d['pairID']: (d['sentence1'], d['sentence2']) for d in dataset}
32 | self.reset()
33 |
34 | def reset(self):
35 | self.answers = {}
36 |
37 | @property
38 | def count(self):
39 | return len(self.answers)
40 |
41 | def add(self, pairIds, preds):
42 | for pairId, pred in zip(pairIds,preds):
43 | if pairId not in self.targets:
44 | continue
45 | if pairId not in self.answers:
46 | self.answers[pairId] = pred
47 | else:
48 | print(f"Double prediction for {pairId} former: {self.answers[pairId]} new: {pred}")
49 |
50 | @property
51 | def has_enough_for_cache_hash(self):
52 | return self.count >= 100
53 |
54 | @property
55 | def accuracy(self):
56 | correct = [self.targets[k] == a for k,a in self.answers.items() if a is not None]
57 | accuracy = sum(correct) / self.count if self.count > 0 else 0
58 | if self.count != len(self.targets):
59 | return (accuracy, f"partial on {self.count} out of {len(self.targets)}")
60 | return accuracy
61 |
62 |
63 | class MultiNLI(BaseEvaluator):
64 | task = "Natural Language Inference"
65 | dataset = 'MultiNLI' # defined in subclass
66 |
67 | def __init__(self,
68 | local_root: str = '.',
69 | model_name: str = None,
70 | paper_arxiv_id: str = None,
71 | paper_pwc_id: str = None,
72 | paper_results: dict = None,
73 | model_description=None):
74 |
75 | super().__init__(model_name, paper_arxiv_id,
76 | paper_pwc_id, paper_results, model_description)
77 | self.local_root = local_root
78 | paths = self.dataset_paths
79 | self.matched = ClassificationEvaluator(paths[0])
80 | self.mismatched = ClassificationEvaluator(paths[1])
81 | self.reset()
82 |
83 | @property
84 | def dataset_paths(self):
85 | return get_path(self.local_root)
86 |
87 | @property
88 | def data_generator(self):
89 | for v1, v2 in zip_longest(self.matched.dataset.items(), self.mismatched.dataset.items()):
90 | if v1 is not None:
91 | yield v1
92 | if v2 is not None:
93 | yield v2
94 |
95 | def reset(self):
96 | self.matched.reset()
97 | self.mismatched.reset()
98 | self.batch_hash = None
99 | self.reset_time()
100 |
101 | def add(self, pairIds, predictions):
102 | """
103 | pairIds - a pairID (str) or a list of pairIDs; predictions - the corresponding predicted label(s) (str)
104 | """
105 | if isinstance(pairIds, str):
106 | pairIds = [pairIds]
107 | predictions = [predictions]
108 |
109 | self.matched.add(pairIds, predictions)
110 | self.mismatched.add(pairIds, predictions)
111 | if self.batch_hash is None and self.matched.count + self.mismatched.count > 100:
112 | content = self.cache_values(matched=self.matched.answers, mismatched=self.mismatched.answers)
113 | self.batch_hash = calculate_batch_hash(content)
114 | self.first_batch_processed = True #TODO: do we need this if we have self.batch_hash
115 |
116 |
117 | def get_results(self):
118 | if self.cached_results:
119 | return self.results
120 | self.results = {
121 | 'Matched': self.matched.accuracy,
122 | 'Mismatched': self.mismatched.accuracy
123 | }
124 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
125 | exec_speed = (time.time() - self.init_time)
126 | count = self.mismatched.count + self.matched.count
127 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed
128 | self.speed_mem_metrics['Tasks'] = count
129 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
130 | return self.results
131 |
132 | def save(self):
133 |
134 |
135 | return super().save(dataset=self.dataset)
136 |
--------------------------------------------------------------------------------
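Unlike the other evaluators, `MultiNLI` ships no usage example in its docstring, so here is a hedged sketch of the intended loop; `predict_label` is a hypothetical stand-in for your model, `local_root` must point at a directory containing `MNLI.zip` or the extracted dev files, and the cache check mirrors the pattern used in the other evaluators' examples:

    from sotabencheval.natural_language_inference import MultiNLI

    evaluator = MultiNLI(
        local_root="data/nlp/multinli",   # assumed location of MNLI.zip / extracted dev files
        model_name="My MNLI model",       # illustrative metadata only
    )

    def predict_label(premise, hypothesis):
        # Placeholder for a real model; must return 'entailment', 'neutral' or 'contradiction'.
        return "neutral"

    # data_generator interleaves the matched and mismatched dev sets,
    # yielding (pairID, (sentence1, sentence2)) tuples.
    for pair_id, (premise, hypothesis) in evaluator.data_generator:
        evaluator.add(pair_id, predict_label(premise, hypothesis))
        if evaluator.batch_hash is not None and evaluator.cache_exists:
            break

    evaluator.save()
    print(evaluator.get_results())   # {'Matched': ..., 'Mismatched': ...}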
/sotabencheval/object_detection/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["COCOEvaluator"]
2 |
3 | from sotabencheval.object_detection.coco import COCOEvaluator
--------------------------------------------------------------------------------
/sotabencheval/object_detection/coco.py:
--------------------------------------------------------------------------------
1 | # Some of the processing logic here is based on the torchvision COCO dataset
2 | # https://github.com/pytorch/vision/blob/master/torchvision/datasets/coco.py
3 |
4 | import copy
5 | import numpy as np
6 | import os
7 | from pycocotools.coco import COCO
8 | from sotabenchapi.client import Client
9 | from sotabenchapi.core import BenchmarkResult, check_inputs
10 | import time
11 |
12 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server
13 | from sotabencheval.utils import get_max_memory_allocated
14 | from sotabencheval.object_detection.coco_eval import CocoEvaluator
15 | from sotabencheval.object_detection.utils import get_coco_metrics
16 |
17 |
18 | class COCOEvaluator(object):
19 | """`COCO `_ benchmark.
20 |
21 | Examples:
22 | Evaluate a ResNeXt model from the torchvision repository:
23 |
24 | .. code-block:: python
25 |
26 | ...
27 |
28 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870')
29 |
30 | with torch.no_grad():
31 | for i, (input, __) in enumerate(iterator):
32 | ...
33 | output = model(input)
34 | # optional formatting of output here to be a list of detection dicts
35 | evaluator.add(output)
36 |
37 | if evaluator.cache_exists:
38 | break
39 |
40 | evaluator.save()
41 | """
42 |
43 | task = "Object Detection"
44 |
45 | def __init__(self,
46 | root: str = '.',
47 | split: str = "val",
48 | dataset_year: str = "2017",
49 | model_name: str = None,
50 | paper_arxiv_id: str = None,
51 | paper_pwc_id: str = None,
52 | paper_results: dict = None,
53 | model_description=None,):
54 | """Initializes a COCO Evaluator object
55 |
56 | Args:
57 | root (string): Root directory of the COCO Dataset - where the
58 | label data is located (or will be downloaded to).
59 | split (str) : the split for COCO to use, e.g. 'val'
60 | dataset_year (str): the dataset year for COCO to use
61 | model_name (str, optional): The name of the model from the
62 | paper - if you want to link your build to a machine learning
63 | paper. See the COCO benchmark page for model names,
64 | https://sotabench.com/benchmarks/object-detection-on-coco-minival,
65 | e.g. on the paper leaderboard tab.
66 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
67 | want to link to papers on the leaderboard; put in the
68 | corresponding paper's arXiv ID, e.g. '1611.05431'.
69 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
70 | put in the corresponding papers with code URL slug, e.g.
71 | 'u-gat-it-unsupervised-generative-attentional'
72 | paper_results (dict, optional) : If the paper you are reproducing
73 | does not have model results on sotabench.com, you can specify
74 | the paper results yourself through this argument, where keys
75 | are metric names, values are metric values. e.g::
76 |
77 | {'box AP': 0.349, 'AP50': 0.592, ...}.
78 |
79 | Ensure that the metric names match those on the sotabench
80 | leaderboard - for COCO it should be 'box AP', 'AP50',
81 | 'AP75', 'APS', 'APM', 'APL'
82 | model_description (str, optional): Optional model description.
83 | """
84 | root = self.root = change_root_if_server(root=root,
85 | server_root="./.data/vision/coco")
86 |
87 | # Model metadata
88 |
89 | self.model_name = model_name
90 | self.paper_arxiv_id = paper_arxiv_id
91 | self.paper_pwc_id = paper_pwc_id
92 | self.paper_results = paper_results
93 | self.model_description = model_description
94 | self.split = split
95 |
96 | annFile = os.path.join(
97 | root, "annotations/instances_%s%s.json" % (self.split, dataset_year)
98 | )
99 |
100 | self._download(annFile)
101 |
102 | self.coco = COCO(annFile)
103 | self.iou_types = ['bbox']
104 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)
105 |
106 | self.detections = []
107 | self.results = None
108 |
109 | # Backend variables for hashing and caching
110 |
111 | self.first_batch_processed = False
112 | self.batch_hash = None
113 | self.cached_results = False
114 |
115 | # Speed and memory metrics
116 |
117 | self.speed_mem_metrics = {}
118 | self.init_time = time.time()
119 |
120 | def _download(self, annFile):
121 | """
122 | Utility function for downloading the COCO annotation file
123 |
124 | :param annFile: path of the annotations file
125 | :return: void - extracts the archive
126 | """
127 | if not os.path.isfile(annFile):
128 | if "2017" in annFile:
129 | annotations_dir_zip = os.path.join(
130 | self.root, "annotations_train%s2017.zip" % self.split
131 | )
132 | elif "2014" in annFile:
133 | annotations_dir_zip = os.path.join(
134 | self.root, "annotations_train%s2014.zip" % self.split
135 | )
136 | else:
137 | annotations_dir_zip = None
138 |
139 | if annotations_dir_zip is not None:
140 | print('Attempt to extract annotations file at {zip_loc}'.format(zip_loc=annotations_dir_zip))
141 | extract_archive(from_path=annotations_dir_zip, to_path=self.root)
142 |
143 | @property
144 | def cache_exists(self):
145 | """
146 | Checks whether the cache exists in the sotabench.com database - if so
147 | then sets self.results to cached results and returns True.
148 |
149 | You can use this property for control flow to break a for loop over a dataset
150 | after the first iteration. This prevents rerunning the same calculation for the
151 | same model twice.
152 |
153 | Examples:
154 | Breaking a for loop
155 |
156 | .. code-block:: python
157 |
158 | ...
159 |
160 | with torch.no_grad():
161 | for i, (input, target) in enumerate(iterator):
162 | ...
163 | output = model(input)
164 | # optional formatting of output here to be a list of detection dicts
165 | evaluator.add(output)
166 |
167 | if evaluator.cache_exists:
168 | break
169 |
170 | evaluator.save()
171 |
172 | :return: bool or None (if not in check mode)
173 | """
174 | if not self.first_batch_processed:
175 | raise ValueError('No batches of data have been processed so no batch_hash exists')
176 |
177 | if not is_server(): # we only check the cache on the server
178 | return None
179 |
180 | client = Client.public()
181 | cached_res = client.get_results_by_run_hash(self.batch_hash)
182 | if cached_res:
183 | self.results = cached_res
184 | self.cached_results = True
185 | print(
186 | "No model change detected (using the first batch run "
187 | "hash). Will use cached results."
188 | )
189 | return True
190 |
191 | return False
192 |
193 | @staticmethod
194 | def cache_format_ann(ann):
195 | """
196 | Cache-formats an annotation dictionary by rounding its values. We round because otherwise small
197 | floating-point differences between runs would change the hash of the predictions.
198 |
199 | :param ann (dict): A detection dictionary
200 |
201 | :return: ann : A detection dictionary but with rounded values
202 | """
203 | ann['bbox'] = [np.round(el, 3) for el in ann['bbox']]
204 | ann['score'] = np.round(ann['score'], 3)
205 |
206 | if 'segmentation' in ann:
207 | ann['segmentation'] = [np.round(el, 3) for el in ann['segmentation']]
208 |
209 | if 'area' in ann:
210 | ann['area'] = np.round(ann['area'], 3)
211 |
212 | return ann
213 |
214 | def cache_values(self, annotations, metrics):
215 | """
216 | Takes in annotations and metrics, and formats the data to calculate the hash for the cache
217 | :param annotations: list of detections
218 | :param metrics: dictionary of final AP metrics
219 | :return: list of data (combining annotations and metrics)
220 | """
221 | metrics = {k: np.round(v, 3) for k, v in metrics.items()}
222 | new_annotations = copy.deepcopy(annotations)
223 | new_annotations = [self.cache_format_ann(ann) for ann in new_annotations]
224 |
225 | return new_annotations + [metrics]
226 |
227 | def add(self, detections: list):
228 | """
229 | Update the evaluator with new detections
230 |
231 | :param detections (list): List of detections that will be used by the COCO.loadRes method in the
232 | pycocotools API. Each detection can take a dictionary format like the following:
233 |
234 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
235 | 'score': 0.999152421951294, 'category_id': 1}
236 |
237 | I.e., it is a list of dictionaries.
238 |
239 | :return: void - updates self.detections with the new IDs and predictions
240 |
241 | Examples:
242 | Update the evaluator with two results:
243 |
244 | .. code-block:: python
245 |
246 | my_evaluator.add([{'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312,
247 | 110.14895629882812, 278.2847595214844], 'score': 0.999152421951294, 'category_id': 1}])
248 | """
249 | self.detections.extend(detections)
250 |
251 | self.coco_evaluator.update(detections)
252 |
253 | if not self.first_batch_processed:
254 | self.coco_evaluator.evaluate()
255 | self.coco_evaluator.accumulate()
256 |
257 | if any([detection['bbox'] for detection in detections]): # we can only hash if we have predictions
258 | self.batch_hash = calculate_batch_hash(
259 | self.cache_values(annotations=detections, metrics=get_coco_metrics(self.coco_evaluator)))
260 | self.first_batch_processed = True
261 |
262 | def get_results(self):
263 | """
264 | Reruns the evaluation using the accumulated detections, returns COCO results with AP metrics
265 |
266 | :return: dict with COCO AP metrics
267 | """
268 | if self.cached_results:
269 | return self.results
270 |
271 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)
272 | self.coco_evaluator.update(self.detections)
273 | self.coco_evaluator.evaluate()
274 | self.coco_evaluator.accumulate()
275 | self.coco_evaluator.summarize()
276 |
277 | self.results = get_coco_metrics(self.coco_evaluator)
278 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
279 |
280 | return self.results
281 |
282 | def reset_time(self):
283 | """
284 | Simple method to reset the timer self.init_time. Often used before an evaluation loop, so that the
285 | evaluation is timed appropriately.
286 |
287 | :return: void - resets self.init_time
288 | """
289 | self.init_time = time.time()
290 |
291 | def save(self):
292 | """
293 | Calculate results and then put into a BenchmarkResult object
294 |
295 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
296 | on the platform.
297 |
298 | :return: BenchmarkResult object with results and metadata
299 | """
300 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
301 | self.get_results()
302 |
303 | # If this is the first time the model is run, then we record evaluation time information
304 |
305 | if not self.cached_results:
306 | unique_image_ids = set([d['image_id'] for d in self.detections])
307 | exec_speed = (time.time() - self.init_time)
308 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(unique_image_ids) / exec_speed
309 | self.speed_mem_metrics['Tasks'] = len(unique_image_ids)
310 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
311 | else:
312 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
313 | self.speed_mem_metrics['Tasks'] = None
314 | self.speed_mem_metrics['Evaluation Time'] = None
315 |
316 | return BenchmarkResult(
317 | task=self.task,
318 | config={},
319 | dataset='COCO minival',
320 | results=self.results,
321 | speed_mem_metrics=self.speed_mem_metrics,
322 | model=self.model_name,
323 | model_description=self.model_description,
324 | arxiv_id=self.paper_arxiv_id,
325 | pwc_id=self.paper_pwc_id,
326 | paper_results=self.paper_results,
327 | run_hash=self.batch_hash,
328 | )
329 |
--------------------------------------------------------------------------------
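To illustrate the rounding performed by `cache_format_ann` (so that tiny floating-point jitter between otherwise identical runs does not change the batch hash), a minimal sketch with one detection:

    from sotabencheval.object_detection import COCOEvaluator

    ann = {
        'image_id': 397133, 'category_id': 1, 'score': 0.999152421951294,
        'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
    }
    rounded = COCOEvaluator.cache_format_ann(dict(ann))   # static method, no evaluator instance needed
    print([float(v) for v in rounded['bbox']])            # [386.163, 69.489, 110.149, 278.285]
    print(float(rounded['score']))                        # 0.999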
/sotabencheval/object_detection/coco_eval.py:
--------------------------------------------------------------------------------
1 | # Code is based on https://github.com/pytorch/vision/blob/master/references/detection/
2 |
3 | import numpy as np
4 | import copy
5 |
6 | from pycocotools.cocoeval import COCOeval
7 | from pycocotools.coco import COCO
8 | import pycocotools.mask as mask_util
9 |
10 | from collections import defaultdict
11 |
12 |
13 | class CocoEvaluator(object):
14 | """
15 | For now this only does BBOX detection - so 'bbox' is the only acceptable iou_type
16 | """
17 | def __init__(self, coco_gt, iou_types):
18 | assert isinstance(iou_types, (list, tuple))
19 | coco_gt = copy.deepcopy(coco_gt)
20 | self.coco_gt = coco_gt
21 |
22 | self.iou_types = iou_types
23 | self.coco_eval = {}
24 | for iou_type in iou_types:
25 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
26 |
27 | self.annotation_list = []
28 |
29 | def update(self, annotation_list):
30 | assert(type(annotation_list) == list)
31 |
32 | self.annotation_list.extend(annotation_list)
33 |
34 | for iou_type in self.iou_types:
35 | coco_dt = loadRes(self.coco_gt, self.annotation_list) if self.annotation_list else COCO()
36 | coco_eval = self.coco_eval[iou_type]
37 | coco_eval.cocoDt = coco_dt
38 | coco_eval.params.imgIds = self.coco_gt.getImgIds()
39 |
40 | def accumulate(self):
41 | for coco_eval in self.coco_eval.values():
42 | coco_eval.accumulate()
43 |
44 | def evaluate(self):
45 | for coco_eval in self.coco_eval.values():
46 | coco_eval.evaluate()
47 |
48 | def summarize(self):
49 | for iou_type, coco_eval in self.coco_eval.items():
50 | # print("IoU metric: {}".format(iou_type))
51 | coco_eval.summarize()
52 |
53 |
54 | #################################################################
55 | # From pycocotools, just removed the prints and fixed
56 | # a Python3 bug about unicode not defined
57 | #################################################################
58 |
59 | # Ideally, pycocotools wouldn't have hard-coded prints
60 | # so that we could avoid copy-pasting those two functions
61 |
62 |
63 | def createIndex(self):
64 | # create index
65 | # print('creating index...')
66 | anns, cats, imgs = {}, {}, {}
67 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
68 | if "annotations" in self.dataset:
69 | for ann in self.dataset["annotations"]:
70 | imgToAnns[ann["image_id"]].append(ann)
71 | anns[ann["id"]] = ann
72 |
73 | if "images" in self.dataset:
74 | for img in self.dataset["images"]:
75 | imgs[img["id"]] = img
76 |
77 | if "categories" in self.dataset:
78 | for cat in self.dataset["categories"]:
79 | cats[cat["id"]] = cat
80 |
81 | if "annotations" in self.dataset and "categories" in self.dataset:
82 | for ann in self.dataset["annotations"]:
83 | catToImgs[ann["category_id"]].append(ann["image_id"])
84 |
85 | # print('index created!')
86 |
87 | # create class members
88 | self.anns = anns
89 | self.imgToAnns = imgToAnns
90 | self.catToImgs = catToImgs
91 | self.imgs = imgs
92 | self.cats = cats
93 |
94 |
95 | maskUtils = mask_util
96 |
97 |
98 | def loadRes(coco, anns):
99 | """Load result file and return a result api object.
100 |
101 | ``anns`` is a list of dicts containing the results
102 |
103 | In the original pycoco api, a results file is passed in, whereas in this
104 | case we bypass the json file loading and ask for a list of dictionary
105 | annotations to be passed directly in
106 |
107 | Returns:
108 | res (obj): result api object.
109 | """
110 | res = COCO()
111 | res.dataset["images"] = [img for img in coco.dataset["images"]]
112 |
113 | # print('Loading and preparing results...')
114 | # tic = time.time()
115 | # if isinstance(resFile, torch._six.string_classes):
116 | # anns = json.load(open(resFile))
117 | # elif type(resFile) == np.ndarray:
118 | # anns = self.loadNumpyAnnotations(resFile)
119 | # else:
120 | # anns = resFile
121 | assert type(anns) == list, "results is not an array of objects"
122 | annsImgIds = [ann["image_id"] for ann in anns]
123 | assert set(annsImgIds) == (
124 | set(annsImgIds) & set(coco.getImgIds())
125 | ), "Results do not correspond to current coco set"
126 | if "caption" in anns[0]:
127 | imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
128 | [ann["image_id"] for ann in anns]
129 | )
130 | res.dataset["images"] = [
131 | img for img in res.dataset["images"] if img["id"] in imgIds
132 | ]
133 | for id, ann in enumerate(anns):
134 | ann["id"] = id + 1
135 | elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
136 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
137 | for id, ann in enumerate(anns):
138 | bb = ann["bbox"]
139 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
140 | if "segmentation" not in ann:
141 | ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
142 | ann["area"] = bb[2] * bb[3]
143 | ann["id"] = id + 1
144 | ann["iscrowd"] = 0
145 | elif "segmentation" in anns[0]:
146 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
147 | for id, ann in enumerate(anns):
148 | # now only support compressed RLE format as segmentation results
149 | ann["area"] = maskUtils.area(ann["segmentation"])
150 | if "bbox" not in ann:
151 | ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
152 | ann["id"] = id + 1
153 | ann["iscrowd"] = 0
154 | elif "keypoints" in anns[0]:
155 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
156 | for id, ann in enumerate(anns):
157 | s = ann["keypoints"]
158 | x = s[0::3]
159 | y = s[1::3]
160 | x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
161 | ann["area"] = (x1 - x0) * (y1 - y0)
162 | ann["id"] = id + 1
163 | ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
164 | # print('DONE (t={:0.2f}s)'.format(time.time()- tic))
165 |
166 | res.dataset["annotations"] = anns
167 | createIndex(res)
168 | return res
169 |
--------------------------------------------------------------------------------
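For a bbox-only detection, `loadRes` above derives the remaining fields COCOeval expects (`area`, a rectangular `segmentation`, `id`, `iscrowd`); the derivation for a single `[x, y, width, height]` box is just:

    bb = [386.16, 69.49, 110.15, 278.28]                 # [x, y, width, height]
    x1, x2, y1, y2 = bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]

    area = bb[2] * bb[3]                                  # width * height
    segmentation = [[x1, y1, x1, y2, x2, y2, x2, y1]]     # the box corners as a polygon
    print(area)
    print(segmentation)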
/sotabencheval/object_detection/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def get_coco_metrics(coco_evaluator):
4 |
5 | metrics = {
6 | "box AP": None,
7 | "AP50": None,
8 | "AP75": None,
9 | "APS": None,
10 | "APM": None,
11 | "APL": None,
12 | }
13 | iouThrs = [None, 0.5, 0.75, None, None, None]
14 | maxDets = [100] + [coco_evaluator.coco_eval["bbox"].params.maxDets[2]] * 5
15 | areaRngs = ["all", "all", "all", "small", "medium", "large"]
16 | bounding_box_params = coco_evaluator.coco_eval["bbox"].params
17 |
18 | for metric_no, metric in enumerate(metrics):
19 | aind = [
20 | i
21 | for i, aRng in enumerate(bounding_box_params.areaRngLbl)
22 | if aRng == areaRngs[metric_no]
23 | ]
24 | mind = [
25 | i
26 | for i, mDet in enumerate(bounding_box_params.maxDets)
27 | if mDet == maxDets[metric_no]
28 | ]
29 |
30 | s = coco_evaluator.coco_eval["bbox"].eval["precision"]
31 |
32 | # IoU
33 | if iouThrs[metric_no] is not None:
34 | t = np.where(iouThrs[metric_no] == bounding_box_params.iouThrs)[0]
35 | s = s[t]
36 | s = s[:, :, :, aind, mind]
37 |
38 | if len(s[s > -1]) == 0:
39 | mean_s = -1
40 | else:
41 | mean_s = np.mean(s[s > -1])
42 |
43 | metrics[metric] = mean_s
44 |
45 | return metrics
46 |
--------------------------------------------------------------------------------
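`get_coco_metrics` reads the precision array that COCOeval accumulates; the surrounding pipeline, as used by `COCOEvaluator.get_results`, looks roughly like this (here `coco_gt` is assumed to be an already-loaded pycocotools `COCO` ground-truth object and `detections` a list of detection dicts, neither of which is built in this sketch):

    from sotabencheval.object_detection.coco_eval import CocoEvaluator
    from sotabencheval.object_detection.utils import get_coco_metrics

    coco_evaluator = CocoEvaluator(coco_gt, ['bbox'])   # coco_gt: pycocotools COCO object (assumed)
    coco_evaluator.update(detections)                   # detections: list of detection dicts (assumed)
    coco_evaluator.evaluate()                           # per-image evaluation
    coco_evaluator.accumulate()                         # fills coco_eval['bbox'].eval['precision']
    coco_evaluator.summarize()

    metrics = get_coco_metrics(coco_evaluator)
    print(metrics)   # {'box AP': ..., 'AP50': ..., 'AP75': ..., 'APS': ..., 'APM': ..., 'APL': ...}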
/sotabencheval/question_answering/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.question_answering.squad import SQuADEvaluator, SQuADVersion
2 |
3 | __all__ = ["SQuADEvaluator", "SQuADVersion"]
4 |
--------------------------------------------------------------------------------
/sotabencheval/question_answering/evaluate_v11.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
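A quick worked example of the two metrics above: `normalize_answer` lower-cases and strips punctuation and articles, so a prediction can match a gold answer that differs only in those respects.

    from sotabencheval.question_answering.evaluate_v11 import (
        exact_match_score, f1_score, metric_max_over_ground_truths)

    ground_truths = ["Denver Broncos", "The Denver Broncos"]
    prediction = "the denver broncos."

    # normalize_answer drops "the", punctuation and case, so both metrics are perfect here.
    print(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths))  # True (counts as 1)
    print(metric_max_over_ground_truths(f1_score, prediction, ground_truths))           # 1.0

    # A partial overlap gives a fractional F1:
    print(f1_score("Denver", "Denver Broncos"))   # precision 1.0, recall 0.5 -> F1 ~ 0.667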
/sotabencheval/question_answering/evaluate_v20.py:
--------------------------------------------------------------------------------
1 | """Official evaluation script for SQuAD version 2.0.
2 |
3 | In addition to basic functionality, we also compute additional statistics and
4 | plot precision-recall curves if an additional na_prob.json file is provided.
5 | This file is expected to map question ID's to the model's predicted probability
6 | that a question is unanswerable.
7 | """
8 | import argparse
9 | import collections
10 | import json
11 | import numpy as np
12 | import os
13 | import re
14 | import string
15 | import sys
16 |
17 | OPTS = None
18 |
19 | def parse_args():
20 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
21 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
22 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
23 | parser.add_argument('--out-file', '-o', metavar='eval.json',
24 | help='Write accuracy metrics to file (default is stdout).')
25 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
26 | help='Model estimates of probability of no answer.')
27 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
28 | help='Predict "" if no-answer probability exceeds this (default = 1.0).')
29 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
30 | help='Save precision-recall curves to directory.')
31 | parser.add_argument('--verbose', '-v', action='store_true')
32 | if len(sys.argv) == 1:
33 | parser.print_help()
34 | sys.exit(1)
35 | return parser.parse_args()
36 |
37 | def make_qid_to_has_ans(dataset):
38 | qid_to_has_ans = {}
39 | for article in dataset:
40 | for p in article['paragraphs']:
41 | for qa in p['qas']:
42 | qid_to_has_ans[qa['id']] = bool(qa['answers'])
43 | return qid_to_has_ans
44 |
45 | def normalize_answer(s):
46 | """Lower text and remove punctuation, articles and extra whitespace."""
47 | def remove_articles(text):
48 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
49 | return re.sub(regex, ' ', text)
50 | def white_space_fix(text):
51 | return ' '.join(text.split())
52 | def remove_punc(text):
53 | exclude = set(string.punctuation)
54 | return ''.join(ch for ch in text if ch not in exclude)
55 | def lower(text):
56 | return text.lower()
57 | return white_space_fix(remove_articles(remove_punc(lower(s))))
58 |
59 | def get_tokens(s):
60 | if not s: return []
61 | return normalize_answer(s).split()
62 |
63 | def compute_exact(a_gold, a_pred):
64 | return int(normalize_answer(a_gold) == normalize_answer(a_pred))
65 |
66 | def compute_f1(a_gold, a_pred):
67 | gold_toks = get_tokens(a_gold)
68 | pred_toks = get_tokens(a_pred)
69 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
70 | num_same = sum(common.values())
71 | if len(gold_toks) == 0 or len(pred_toks) == 0:
72 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
73 | return int(gold_toks == pred_toks)
74 | if num_same == 0:
75 | return 0
76 | precision = 1.0 * num_same / len(pred_toks)
77 | recall = 1.0 * num_same / len(gold_toks)
78 | f1 = (2 * precision * recall) / (precision + recall)
79 | return f1
80 |
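# Note on the no-answer convention implemented above: F1 for an unanswerable
# question is all-or-nothing, because an empty gold answer and an empty
# prediction both normalize to an empty token list. For example:
#   compute_f1('', '')                           -> 1    (correctly predicts "no answer")
#   compute_f1('', 'some answer')                -> 0    (answers an unanswerable question)
#   compute_f1('late 1990s', 'the late 1990s')   -> 1.0  (articles, case and punctuation are stripped)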
81 | def get_raw_scores(dataset, preds):
82 | exact_scores = {}
83 | f1_scores = {}
84 | for article in dataset:
85 | for p in article['paragraphs']:
86 | for qa in p['qas']:
87 | qid = qa['id']
88 | gold_answers = [a['text'] for a in qa['answers']
89 | if normalize_answer(a['text'])]
90 | if not gold_answers:
91 | # For unanswerable questions, only correct answer is empty string
92 | gold_answers = ['']
93 | if qid not in preds:
94 | print('Missing prediction for %s' % qid)
95 | continue
96 | a_pred = preds[qid]
97 | # Take max over all gold answers
98 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
99 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
100 | return exact_scores, f1_scores
101 |
102 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
103 | new_scores = {}
104 | for qid, s in scores.items():
105 | pred_na = na_probs[qid] > na_prob_thresh
106 | if pred_na:
107 | new_scores[qid] = float(not qid_to_has_ans[qid])
108 | else:
109 | new_scores[qid] = s
110 | return new_scores
111 |
112 | def make_eval_dict(exact_scores, f1_scores, qid_list=None):
113 | if not qid_list:
114 | total = len(exact_scores)
115 | return collections.OrderedDict([
116 | ('exact', 100.0 * sum(exact_scores.values()) / total),
117 | ('f1', 100.0 * sum(f1_scores.values()) / total),
118 | ('total', total),
119 | ])
120 | else:
121 | total = len(qid_list)
122 | return collections.OrderedDict([
123 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
124 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
125 | ('total', total),
126 | ])
127 |
128 | def merge_eval(main_eval, new_eval, prefix):
129 | for k in new_eval:
130 | main_eval['%s_%s' % (prefix, k)] = new_eval[k]
131 |
132 | def plot_pr_curve(precisions, recalls, out_image, title):
133 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
134 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
135 | plt.xlabel('Recall')
136 | plt.ylabel('Precision')
137 | plt.xlim([0.0, 1.05])
138 | plt.ylim([0.0, 1.05])
139 | plt.title(title)
140 | plt.savefig(out_image)
141 | plt.clf()
142 |
143 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
144 | out_image=None, title=None):
145 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
146 | true_pos = 0.0
147 | cur_p = 1.0
148 | cur_r = 0.0
149 | precisions = [1.0]
150 | recalls = [0.0]
151 | avg_prec = 0.0
152 | for i, qid in enumerate(qid_list):
153 | if qid_to_has_ans[qid]:
154 | true_pos += scores[qid]
155 | cur_p = true_pos / float(i+1)
156 | cur_r = true_pos / float(num_true_pos)
157 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
158 | # i.e., if we can put a threshold after this point
159 | avg_prec += cur_p * (cur_r - recalls[-1])
160 | precisions.append(cur_p)
161 | recalls.append(cur_r)
162 | if out_image:
163 | plot_pr_curve(precisions, recalls, out_image, title)
164 | return {'ap': 100.0 * avg_prec}
165 |
166 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
167 | qid_to_has_ans, out_image_dir):
168 | if out_image_dir and not os.path.exists(out_image_dir):
169 | os.makedirs(out_image_dir)
170 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
171 | if num_true_pos == 0:
172 | return
173 | pr_exact = make_precision_recall_eval(
174 | exact_raw, na_probs, num_true_pos, qid_to_has_ans,
175 | out_image=os.path.join(out_image_dir, 'pr_exact.png'),
176 | title='Precision-Recall curve for Exact Match score')
177 | pr_f1 = make_precision_recall_eval(
178 | f1_raw, na_probs, num_true_pos, qid_to_has_ans,
179 | out_image=os.path.join(out_image_dir, 'pr_f1.png'),
180 | title='Precision-Recall curve for F1 score')
181 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
182 | pr_oracle = make_precision_recall_eval(
183 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
184 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
185 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
186 | merge_eval(main_eval, pr_exact, 'pr_exact')
187 | merge_eval(main_eval, pr_f1, 'pr_f1')
188 | merge_eval(main_eval, pr_oracle, 'pr_oracle')
189 |
190 | def histogram_na_prob(na_probs, qid_list, image_dir, name):
191 | if not qid_list:
192 | return
193 | x = [na_probs[k] for k in qid_list]
194 | weights = np.ones_like(x) / float(len(x))
195 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
196 | plt.xlabel('Model probability of no-answer')
197 | plt.ylabel('Proportion of dataset')
198 | plt.title('Histogram of no-answer probability: %s' % name)
199 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
200 | plt.clf()
201 |
202 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
203 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
204 | cur_score = num_no_ans
205 | best_score = cur_score
206 | best_thresh = 0.0
207 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
208 | for i, qid in enumerate(qid_list):
209 | if qid not in scores: continue
210 | if qid_to_has_ans[qid]:
211 | diff = scores[qid]
212 | else:
213 | if preds[qid]:
214 | diff = -1
215 | else:
216 | diff = 0
217 | cur_score += diff
218 | if cur_score > best_score:
219 | best_score = cur_score
220 | best_thresh = na_probs[qid]
221 | return 100.0 * best_score / len(scores), best_thresh
222 |
223 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
224 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
225 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
226 | main_eval['best_exact'] = best_exact
227 | main_eval['best_exact_thresh'] = exact_thresh
228 | main_eval['best_f1'] = best_f1
229 | main_eval['best_f1_thresh'] = f1_thresh
230 |
231 | def main():
232 | with open(OPTS.data_file) as f:
233 | dataset_json = json.load(f)
234 | dataset = dataset_json['data']
235 | with open(OPTS.pred_file) as f:
236 | preds = json.load(f)
237 | if OPTS.na_prob_file:
238 | with open(OPTS.na_prob_file) as f:
239 | na_probs = json.load(f)
240 | else:
241 | na_probs = {k: 0.0 for k in preds}
242 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
243 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
244 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
245 | exact_raw, f1_raw = get_raw_scores(dataset, preds)
246 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
247 | OPTS.na_prob_thresh)
248 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
249 | OPTS.na_prob_thresh)
250 | out_eval = make_eval_dict(exact_thresh, f1_thresh)
251 | if has_ans_qids:
252 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
253 | merge_eval(out_eval, has_ans_eval, 'HasAns')
254 | if no_ans_qids:
255 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
256 | merge_eval(out_eval, no_ans_eval, 'NoAns')
257 | if OPTS.na_prob_file:
258 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
259 | if OPTS.na_prob_file and OPTS.out_image_dir:
260 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
261 | qid_to_has_ans, OPTS.out_image_dir)
262 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
263 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
264 | if OPTS.out_file:
265 | with open(OPTS.out_file, 'w') as f:
266 | json.dump(out_eval, f)
267 | else:
268 | print(json.dumps(out_eval, indent=2))
269 |
270 | if __name__ == '__main__':
271 | OPTS = parse_args()
272 | if OPTS.out_image_dir:
273 | import matplotlib
274 | matplotlib.use('Agg')
275 | import matplotlib.pyplot as plt
276 | main()
277 |
278 |
--------------------------------------------------------------------------------
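The token-overlap scoring above is easy to sanity-check in isolation. A minimal sketch, assuming the repository layout so the script is importable as `sotabencheval.question_answering.evaluate_v20`:

    from sotabencheval.question_answering.evaluate_v20 import compute_exact, compute_f1

    gold = "the Norman conquest"
    pred = "Norman conquest of England"

    # normalize_answer lower-cases and strips punctuation/articles, so the
    # normalised strings still differ -> exact match is 0
    print(compute_exact(gold, pred))   # 0
    # two of the four predicted tokens overlap the two gold tokens:
    # precision 0.5, recall 1.0 -> F1 = 2/3
    print(compute_f1(gold, pred))      # 0.666...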
/sotabencheval/question_answering/squad.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core import BaseEvaluator
2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server, get_max_memory_allocated
3 | from sotabencheval.question_answering.utils import *
4 | from typing import Dict
5 | from enum import Enum
6 | from pathlib import Path
7 | import json
8 | import time
9 |
10 | class SQuADVersion(Enum):
11 | V11 = 'v1.1'
12 | V20 = 'v2.0'
13 |
14 |
15 | class SQuADEvaluator(BaseEvaluator):
16 | """Evaluator for Stanford Question Answering Dataset v1.1 and v2.0 benchmarks.
17 |
18 | Examples:
19 | Evaluate a BiDAF model from the AllenNLP repository on SQuAD 1.1 development set:
20 |
21 | .. code-block:: python
22 |
23 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
24 |
25 | from allennlp.data import DatasetReader
26 | from allennlp.data.iterators import DataIterator
27 | from allennlp.models.archival import load_archive
28 | from allennlp.nn.util import move_to_device
29 |
30 | def load_model(url, batch_size=64):
31 | archive = load_archive(url, cuda_device=0)
32 | model = archive.model
33 | reader = DatasetReader.from_params(archive.config["dataset_reader"])
34 | iterator_params = archive.config["iterator"]
35 | iterator_params["batch_size"] = batch_size
36 | data_iterator = DataIterator.from_params(iterator_params)
37 | data_iterator.index_with(model.vocab)
38 | return model, reader, data_iterator
39 |
40 | def evaluate(model, dataset, data_iterator, evaluator):
41 | model.eval()
42 | evaluator.reset_time()
43 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
44 | batch = move_to_device(batch, 0)
45 | predictions = model(**batch)
46 | answers = {metadata['id']: prediction
47 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
48 | evaluator.add(answers)
49 | if evaluator.cache_exists:
50 | break
51 |
52 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
53 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
54 |
55 | model, reader, data_iter =\
56 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
57 | dataset = reader.read(evaluator.dataset_path)
58 | evaluate(model, dataset, data_iter, evaluator)
59 | evaluator.save()
60 | print(evaluator.results)
61 | """
62 |
63 | task = "Question Answering"
64 |
65 | def __init__(self,
66 | local_root: str = '.',
67 | dataset_filename: str = None,
68 | model_name: str = None,
69 | paper_arxiv_id: str = None,
70 | paper_pwc_id: str = None,
71 | paper_results: dict = None,
72 | model_description=None,
73 | version: SQuADVersion = SQuADVersion.V20):
74 | """
75 | Creates an evaluator for SQuAD v1.1 or v2.0 Question Answering benchmarks.
76 |
77 | :param local_root: Path to the directory where the dataset files are located locally.
78 | Ignored when run on sotabench server.
79 | :param dataset_filename: Local filename of the JSON file with the SQuAD dataset.
80 | If None, the standard filename is used, based on :param:`version`.
81 | Ignored when run on sotabench server.
82 | :param model_name: The name of the model from the
83 | paper - if you want to link your build to a model from a
84 |             machine learning paper. See the SQuAD benchmark pages for model names
85 |             (e.g. https://sotabench.com/benchmarks/question-answering-on-squad11-dev),
86 |             under the paper leaderboard or "models yet to try" tabs.
87 | :param paper_arxiv_id: Optional linking to arXiv if you
88 | want to link to papers on the leaderboard; put in the
89 | corresponding paper's arXiv ID, e.g. '1907.10529'.
90 | :param paper_pwc_id: Optional linking to Papers With Code;
91 | put in the corresponding papers with code URL slug, e.g.
92 | 'spanbert-improving-pre-training-by'
93 | :param paper_results: If the paper model you are reproducing
94 | does not have model results on sotabench.com, you can specify
95 | the paper results yourself through this argument, where keys
96 | are metric names, values are metric values. e.g:
97 |
98 | {'EM': 0.858, 'F1': 0.873}.
99 |
100 | Ensure that the metric names match those on the sotabench
101 | leaderboard - for SQuAD benchmarks it should be `EM` for exact match
102 | and `F1` for F1 score. Make sure to use results of evaluation on a development set.
103 | :param model_description: Optional model description.
104 | :param version: Which dataset to evaluate on, either `SQuADVersion.V11` or `SQuADVersion.V20`.
105 | """
106 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
107 | self.root = change_root_if_server(root=local_root,
108 | server_root=".data/nlp/squad")
109 | self.version = version
110 | if dataset_filename is None or is_server():
111 | dataset_filename = "dev-{}.json".format(version.value)
112 | self.dataset_path = Path(self.root) / dataset_filename
113 |
114 | self.metrics = SQuADMetrics(self.dataset_path, version)
115 |
116 | def add(self, answers: Dict[str, str]):
117 | """
118 | Updates the evaluator with new results
119 |
120 | :param answers: a dictionary, where keys are question ids and values are text answers.
121 | For unanswerable questions (SQuAD v2.0) the answer should be an empty string.
122 |
123 | Examples:
124 | Update the evaluator with two results:
125 |
126 | .. code-block:: python
127 |
128 | my_evaluator.add({
129 | "57296d571d04691400779413": "itself",
130 | "5a89117e19b91f001a626f2d": ""
131 | })
132 | """
133 |
134 | self.metrics.add(answers)
135 |
136 | if not self.first_batch_processed and self.metrics.has_data:
137 | self.batch_hash = calculate_batch_hash(
138 | self.cache_values(answers=self.metrics.answers,
139 | metrics=self.metrics.get_results(ignore_missing=True))
140 | )
141 | self.first_batch_processed = True
142 |
143 | def reset(self):
144 | """
145 | Removes already added answers
146 |
147 |         When checking whether the model should be rerun on the whole dataset, it is first run on a smaller
148 |         subset and the results are compared with values cached on the sotabench server (the check is not
149 |         performed when running locally). Ideally, the smaller subset is just the first batch, so no additional
150 | computation is needed. However, for more complex multistage pipelines it may be simpler to
151 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case
152 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
153 |
154 | .. seealso:: :func:`cache_exists`
155 | .. seealso:: :func:`reset_time`
156 | """
157 |
158 | self.metrics.reset()
159 | self.reset_time()
160 |
161 | def get_results(self):
162 | """
163 | Gets the results for the evaluator.
164 |
165 | :return: dict with `EM` (exact match score) and `F1`.
166 | """
167 |
168 | if self.cached_results:
169 | return self.results
170 | self.results = self.metrics.get_results()
171 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
172 |
173 | return self.results
174 |
175 | def save(self):
176 | dataset = "SQuAD{} dev".format(self.metrics.version.value[1:])
177 |
178 | if not self.cached_results:
179 | exec_speed = (time.time() - self.init_time)
180 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed
181 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers)
182 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
183 | else:
184 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
185 | self.speed_mem_metrics['Tasks'] = None
186 | self.speed_mem_metrics['Evaluation Time'] = None
187 |
188 | return super().save(dataset=dataset)
189 |
190 |
191 | # todo: aggregate batches so that size of the batch used for caching does not depend on evaluation batch size
192 | CACHE_BATCH_SIZE = 1024
193 |
194 |
195 | class SQuADMetrics:
196 | def __init__(self, dataset_path: Path, version: SQuADVersion = SQuADVersion.V20):
197 | self.version = version
198 | self.answers = {}
199 | self._dataset = self._load_dataset(dataset_path)
200 | self._results = None
201 |
202 | def _load_dataset(self, path):
203 | with open(path, 'rt') as f:
204 | ds = json.load(f)
205 | if 'version' not in ds or 'data' not in ds:
206 | raise ValueError("Incorrect dataset format, either 'version' or 'data' is missing")
207 | version = ds['version'].strip().lower()
208 | if version and version[0] != 'v':
209 | version = 'v'+version
210 | if self.version.value != version:
211 | raise ValueError("Incorrect dataset version, found {} but was expecting {}"
212 | .format(version, self.version.value))
213 | return ds['data']
214 |
215 | def reset(self):
216 | self._results = None
217 | self.answers = {}
218 |
219 | def add(self, answers: Dict[str, str]):
220 | if not answers:
221 | print("Empty batch added to results")
222 | return
223 | if set(self.answers.keys()) & set(answers.keys()):
224 | print("Multiple predictions for a single question")
225 |
226 | self.answers.update(answers)
227 |
228 | def evaluate(self, ignore_missing=False):
229 | if ignore_missing:
230 | dataset = [{'paragraphs': [
231 | {'qas': [qa for qa in paragraph['qas'] if qa['id'] in self.answers]}
232 | for paragraph in article['paragraphs']
233 | ]} for article in self._dataset]
234 | else:
235 | dataset = self._dataset
236 | if self.version == SQuADVersion.V11:
237 | eval_fn = evaluate_v11
238 | else:
239 | eval_fn = evaluate_v20
240 | results = eval_fn(dataset, self.answers)
241 | self._results = {
242 | 'EM': results['exact_match'] / 100.0,
243 | 'F1': results['f1'] / 100.0
244 | }
245 |
246 | @property
247 | def has_data(self):
248 | return bool(self.answers)
249 |
250 | def get_results(self, ignore_missing=False):
251 | self.evaluate(ignore_missing)
252 |
253 | return self._results
254 |
--------------------------------------------------------------------------------
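Beyond the AllenNLP example in the docstring, the evaluator only needs a dict of answers, so precomputed predictions can be scored as well. A minimal sketch: `my_predictions` is a hypothetical `{question_id: answer_text}` dict loaded elsewhere, and the batching helper exists only to exercise the caching logic:

    from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

    def in_batches(predictions, size=1024):
        items = list(predictions.items())
        for i in range(0, len(items), size):
            yield dict(items[i:i + size])

    evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="My model",
                               version=SQuADVersion.V20)

    evaluator.reset_time()
    for batch in in_batches(my_predictions):   # my_predictions: {qid: answer}
        evaluator.add(batch)
        if evaluator.cache_exists:
            break
    evaluator.save()
    print(evaluator.results)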
/sotabencheval/question_answering/utils.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.question_answering.evaluate_v11 import evaluate as evaluate_v11
2 | from sotabencheval.question_answering.evaluate_v20 import get_raw_scores
3 |
4 | __all__ = ["evaluate_v11", "evaluate_v20"]
5 |
6 |
7 | def evaluate_v20(dataset, predictions):
8 | exact_scores, f1_scores = get_raw_scores(dataset, predictions)
9 | total = sum([len(p['qas']) for article in dataset for p in article['paragraphs']])
10 | exact_match = 100.0 * sum(exact_scores.values()) / total
11 | f1 = 100.0 * sum(f1_scores.values()) / total
12 | return {'exact_match': exact_match, 'f1': f1}
13 |
--------------------------------------------------------------------------------
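A tiny worked example of the v2.0 wrapper on an in-memory dataset; both the answerable and the unanswerable question are answered correctly, so EM and F1 are both 100:

    from sotabencheval.question_answering.utils import evaluate_v20

    dataset = [{'paragraphs': [{'qas': [
        {'id': 'q1', 'answers': [{'text': 'Paris'}]},   # answerable
        {'id': 'q2', 'answers': []},                    # unanswerable (v2.0)
    ]}]}]
    predictions = {'q1': 'Paris', 'q2': ''}

    print(evaluate_v20(dataset, predictions))
    # {'exact_match': 100.0, 'f1': 100.0}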
/sotabencheval/semantic_segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["PASCALVOCEvaluator"]
2 |
3 | from sotabencheval.semantic_segmentation.ade20k import ADE20KEvaluator
4 | from sotabencheval.semantic_segmentation.pascalvoc import PASCALVOCEvaluator
--------------------------------------------------------------------------------
/sotabencheval/semantic_segmentation/ade20k.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sotabenchapi.client import Client
3 | from sotabenchapi.core import BenchmarkResult, check_inputs
4 | import time
5 |
6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated
7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix
8 |
9 |
10 | class ADE20KEvaluator(object):
11 | """`ADE20K `_ benchmark.
12 |
13 | Examples:
14 | Evaluate a HRNetV2 model from the CSAILVision repository
15 |
16 | .. code-block:: python
17 |
18 | ...
19 |
20 | evaluator = ADE20KEvaluator(model_name='HRNetV2 (HRNetV2-W48)', paper_arxiv_id='1904.04514')
21 |
22 | for batch_data in loader:
23 | # process data
24 | batch_data = batch_data[0]
25 | seg_label = as_numpy(batch_data['seg_label'][0])
26 | img_resized_list = batch_data['img_data']
27 |
28 | torch.cuda.synchronize()
29 | tic = time.perf_counter()
30 | with torch.no_grad():
31 | segSize = (seg_label.shape[0], seg_label.shape[1])
32 | scores = torch.zeros(1, cfg.DATASET.num_class, segSize[0], segSize[1])
33 | scores = async_copy_to(scores, gpu)
34 |
35 | for img in img_resized_list:
36 | feed_dict = batch_data.copy()
37 | feed_dict['img_data'] = img
38 | del feed_dict['img_ori']
39 | del feed_dict['info']
40 | feed_dict = async_copy_to(feed_dict, gpu)
41 |
42 | # forward pass
43 | scores_tmp = segmentation_module(feed_dict, segSize=segSize)
44 | scores = scores + scores_tmp / len(cfg.DATASET.imgSizes)
45 |
46 | _, pred = torch.max(scores, dim=1)
47 | pred = as_numpy(pred.squeeze(0).cpu())
48 |
49 | torch.cuda.synchronize()
50 |
51 |                 evaluator.add(outputs=pred.flatten(),
52 |                               targets=seg_label.flatten())
53 |
54 | if evaluator.cache_exists:
55 | break
56 |
57 | evaluator.save()
58 | """
59 |
60 | task = "Semantic Segmentation"
61 |
62 | def __init__(self,
63 | model_name: str = None,
64 | paper_arxiv_id: str = None,
65 | paper_pwc_id: str = None,
66 | paper_results: dict = None,
67 | model_description=None):
68 | """Initializes a COCO Evaluator object
69 |
70 | Args:
71 | model_name (str, optional): The name of the model from the
72 | paper - if you want to link your build to a machine learning
73 | paper. See the ADE20K benchmark page for model names,
74 | https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val,
75 | e.g. on the paper leaderboard tab.
76 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
77 | want to link to papers on the leaderboard; put in the
78 | corresponding paper's arXiv ID, e.g. '1611.05431'.
79 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
80 | put in the corresponding papers with code URL slug, e.g.
81 | 'u-gat-it-unsupervised-generative-attentional'
82 | paper_results (dict, optional) : If the paper you are reproducing
83 | does not have model results on sotabench.com, you can specify
84 | the paper results yourself through this argument, where keys
85 | are metric names, values are metric values. e.g::
86 |
87 |                 {'Mean IOU': 0.4566, 'Accuracy': 0.543}.
88 |
89 | Ensure that the metric names match those on the sotabench
90 |                 leaderboard - for ADE20K it should be 'Mean IOU', 'Accuracy'
91 | model_description (str, optional): Optional model description.
92 |
93 | """
94 |
95 | # Model metadata
96 |
97 | self.model_name = model_name
98 | self.paper_arxiv_id = paper_arxiv_id
99 | self.paper_pwc_id = paper_pwc_id
100 | self.paper_results = paper_results
101 | self.model_description = model_description
102 |
103 | self.ade20k_evaluator = ConfusionMatrix(150)
104 |
105 | self.outputs = np.array([])
106 | self.targets = np.array([])
107 |
108 | self.results = None
109 |
110 | # Backend variables for hashing and caching
111 |
112 | self.first_batch_processed = False
113 | self.batch_hash = None
114 | self.cached_results = False
115 |
116 | # Speed and memory metrics
117 |
118 | self.init_time = time.time()
119 | self.speed_mem_metrics = {}
120 |
121 | @property
122 | def cache_exists(self):
123 | """
124 | Checks whether the cache exists in the sotabench.com database - if so
125 | then sets self.results to cached results and returns True.
126 |
127 | You can use this property for control flow to break a for loop over a dataset
128 | after the first iteration. This prevents rerunning the same calculation for the
129 | same model twice.
130 |
131 | Examples:
132 | Breaking a for loop
133 |
134 | .. code-block:: python
135 |
136 | ...
137 |
138 | with torch.no_grad():
139 | for i, (input, target) in enumerate(iterator):
140 | ...
141 | output = model(input)
142 | # output and target should then be flattened into 1D np.ndarrays and passed in below
143 |                         evaluator.add(outputs=output, targets=target)
144 |
145 | if evaluator.cache_exists:
146 | break
147 |
148 | evaluator.save()
149 |
150 | :return: bool or None (if not in check mode)
151 | """
152 | if not self.first_batch_processed:
153 | raise ValueError('No batches of data have been processed so no batch_hash exists')
154 |
155 | if not is_server():
156 | return None
157 |
158 | client = Client.public()
159 | cached_res = client.get_results_by_run_hash(self.batch_hash)
160 | if cached_res:
161 | self.results = cached_res
162 | self.cached_results = True
163 | print(
164 | "No model change detected (using the first batch run "
165 | "hash). Will use cached results."
166 | )
167 | return True
168 |
169 | return False
170 |
171 | def add(self, outputs: np.ndarray, targets: np.ndarray):
172 | """
173 | Update the evaluator with new results from the model
174 |
175 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel
176 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel
177 |
178 | The method requires an outputs input and a targets input - both flattened.
179 |
180 | Suppose you are making predictions, batch by batch, and have your model outputs
181 | and the original targets with batch_size 32, and image size 520 x 480.
182 | The shape of your outputs might look like this:
183 |
184 | batch_output.shape
185 |             >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes
186 |
187 | batch_target.shape
188 | >> (32, 520, 480)
189 |
190 | We can flatten the entire output and targets to 1D vectors for each pixel:
191 |
192 | flattened_batch_output.shape
193 | >> (7987200) # flatten by taking the max class prediction
194 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
195 |
196 | flattened_batch_target.shape
197 | >> (7987200) # (batch_target.flatten() in torch)
198 |
199 | The output might look something like this:
200 |
201 | flattened_batch_output
202 | >> array([6, 6, 6, 6, 6, ...])
203 |
204 | flattened_batch_target
205 | >> array([6, 6, 6, 6, 6, ...])
206 |
207 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
208 | pixels - so the model is correct.
209 |
210 | These flattened arrays can then be passed into the .add() method of the evaluator
211 |
212 | .. code-block:: python
213 |
214 |             my_evaluator.add(outputs=flattened_batch_output,
215 |                              targets=flattened_batch_target)
216 |
217 |
218 | :return: void - updates self.ade20k_evaluator with the data, and updates self.targets and self.outputs
219 | """
220 | self.ade20k_evaluator.update(targets, outputs)
221 |
222 | self.targets = np.append(self.targets, targets)
223 | self.outputs = np.append(self.outputs, outputs)
224 |
225 | if not self.first_batch_processed:
226 | acc_global, acc, iu = self.ade20k_evaluator.compute()
227 | self.batch_hash = calculate_batch_hash(np.append(
228 | np.append(np.around(targets, 3), np.around(outputs, 3)),
229 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3)))
230 | self.first_batch_processed = True
231 |
232 | def get_results(self):
233 | """
234 | Reruns the evaluation using the accumulated detections, returns ADE20K results with IOU and
235 | Accuracy metrics
236 |
237 | :return: dict with ADE20K metrics
238 | """
239 | if self.cached_results:
240 | return self.results
241 |
242 | self.ade20k_evaluator = ConfusionMatrix(150)
243 | self.ade20k_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64))
244 |
245 | acc_global, acc, iu = self.ade20k_evaluator.compute()
246 |
247 | self.results = {
248 | "Accuracy": acc_global.item(),
249 | "Mean IOU": iu.mean().item(),
250 | }
251 |
252 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
253 |
254 | return self.results
255 |
256 | def reset_time(self):
257 | """
258 |         Simple method to reset the timer self.init_time. Often used before the evaluation loop, so the
259 |         evaluation is timed appropriately.
260 |
261 | :return: void - resets self.init_time
262 | """
263 | self.init_time = time.time()
264 |
265 | def save(self):
266 | """
267 | Calculate results and then put into a BenchmarkResult object
268 |
269 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
270 | on the platform.
271 |
272 | :return: BenchmarkResult object with results and metadata
273 | """
274 |
275 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
276 | self.get_results()
277 |
278 | # If this is the first time the model is run, then we record evaluation time information
279 |
280 | if not self.cached_results:
281 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
282 | self.speed_mem_metrics['Tasks'] = None
283 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time)
284 | else:
285 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
286 | self.speed_mem_metrics['Tasks'] = None
287 | self.speed_mem_metrics['Evaluation Time'] = None
288 |
289 | return BenchmarkResult(
290 | task=self.task,
291 | config={},
292 | dataset='ADE20K val',
293 | results=self.results,
294 | speed_mem_metrics=self.speed_mem_metrics,
295 | model=self.model_name,
296 | model_description=self.model_description,
297 | arxiv_id=self.paper_arxiv_id,
298 | pwc_id=self.paper_pwc_id,
299 | paper_results=self.paper_results,
300 | run_hash=self.batch_hash,
301 | )
302 |
--------------------------------------------------------------------------------
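A short sketch of the flattening that the add() docstring describes, assuming a PyTorch model that outputs (batch, 150, H, W) class scores; the tensors below are random placeholders, only the shapes matter:

    import torch
    from sotabencheval.semantic_segmentation import ADE20KEvaluator

    evaluator = ADE20KEvaluator(model_name='HRNetV2 (HRNetV2-W48)',
                                paper_arxiv_id='1904.04514')

    batch_output = torch.randn(32, 150, 520, 480)          # per-class scores
    batch_target = torch.randint(0, 150, (32, 520, 480))   # ground-truth labels

    # reduce over the class dimension, flatten to 1D and convert to numpy
    evaluator.add(outputs=batch_output.argmax(1).flatten().cpu().numpy(),
                  targets=batch_target.flatten().cpu().numpy())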
/sotabencheval/semantic_segmentation/pascalvoc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sotabenchapi.client import Client
3 | from sotabenchapi.core import BenchmarkResult, check_inputs
4 | import time
5 |
6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated
7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix
8 |
9 |
10 | class PASCALVOCEvaluator(object):
11 | """`PASCAL VOC `_ benchmark.
12 |
13 | Examples:
14 | Evaluate a FCN model from the torchvision repository:
15 |
16 | .. code-block:: python
17 |
18 | ...
19 |
20 | evaluator = PASCALVOCEvaluator(model_name='FCN ResNet-101', paper_arxiv_id='1605.06211')
21 |
22 | with torch.no_grad():
23 | for i, (input, target) in enumerate(iterator):
24 | ...
25 | output = model(input)
26 | # output and target should then be flattened into 1D np.ndarrays and passed in below
27 |                 evaluator.add(outputs=output, targets=target)
28 |
29 | if evaluator.cache_exists:
30 | break
31 |
32 | evaluator.save()
33 | """
34 |
35 | task = "Semantic Segmentation"
36 |
37 | def __init__(self,
38 | model_name: str = None,
39 | paper_arxiv_id: str = None,
40 | paper_pwc_id: str = None,
41 | paper_results: dict = None,
42 | model_description=None):
43 | """Initializes a COCO Evaluator object
44 |
45 | Args:
46 | model_name (str, optional): The name of the model from the
47 | paper - if you want to link your build to a machine learning
48 | paper. See the VOC benchmark page for model names,
49 | https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012-val,
50 | e.g. on the paper leaderboard tab.
51 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
52 | want to link to papers on the leaderboard; put in the
53 | corresponding paper's arXiv ID, e.g. '1611.05431'.
54 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
55 | put in the corresponding papers with code URL slug, e.g.
56 | 'u-gat-it-unsupervised-generative-attentional'
57 | paper_results (dict, optional) : If the paper you are reproducing
58 | does not have model results on sotabench.com, you can specify
59 | the paper results yourself through this argument, where keys
60 | are metric names, values are metric values. e.g::
61 |
62 | {'Mean IOU': 76.42709, 'Accuracy': 95.31, ...}.
63 |
64 | Ensure that the metric names match those on the sotabench
65 | leaderboard - for PASCAL VOC it should be 'Mean IOU', 'Accuracy'
66 | model_description (str, optional): Optional model description.
67 | """
68 |
69 | # Model metadata
70 |
71 | self.model_name = model_name
72 | self.paper_arxiv_id = paper_arxiv_id
73 | self.paper_pwc_id = paper_pwc_id
74 | self.paper_results = paper_results
75 | self.model_description = model_description
76 |
77 | self.voc_evaluator = ConfusionMatrix(21)
78 |
79 | self.outputs = np.array([])
80 | self.targets = np.array([])
81 |
82 | self.results = None
83 |
84 | # Backend variables for hashing and caching
85 |
86 | self.first_batch_processed = False
87 | self.batch_hash = None
88 | self.cached_results = False
89 |
90 | # Speed and memory metrics
91 |
92 | self.init_time = time.time()
93 | self.speed_mem_metrics = {}
94 |
95 | @property
96 | def cache_exists(self):
97 | """
98 | Checks whether the cache exists in the sotabench.com database - if so
99 | then sets self.results to cached results and returns True.
100 |
101 | You can use this property for control flow to break a for loop over a dataset
102 | after the first iteration. This prevents rerunning the same calculation for the
103 | same model twice.
104 |
105 | Examples:
106 | Breaking a for loop
107 |
108 | .. code-block:: python
109 |
110 | ...
111 |
112 | with torch.no_grad():
113 | for i, (input, target) in enumerate(iterator):
114 | ...
115 | output = model(input)
116 | # output and target should then be flattened into 1D np.ndarrays and passed in below
117 |                         evaluator.add(outputs=output, targets=target)
118 |
119 | if evaluator.cache_exists:
120 | break
121 |
122 | evaluator.save()
123 |
124 | :return: bool or None (if not in check mode)
125 | """
126 | if not self.first_batch_processed:
127 | raise ValueError('No batches of data have been processed so no batch_hash exists')
128 |
129 | if not is_server():
130 | return None
131 |
132 | client = Client.public()
133 | cached_res = client.get_results_by_run_hash(self.batch_hash)
134 | if cached_res:
135 | self.results = cached_res
136 | self.cached_results = True
137 | print(
138 | "No model change detected (using the first batch run "
139 | "hash). Will use cached results."
140 | )
141 | return True
142 |
143 | return False
144 |
145 | def add(self, outputs: np.ndarray, targets: np.ndarray):
146 | """
147 | Update the evaluator with new results from the model
148 |
149 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel
150 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel
151 |
152 | The method requires an outputs input and a targets input - both flattened.
153 |
154 | Suppose you are making predictions, batch by batch, and have your model outputs
155 | and the original targets with batch_size 32, and image size 520 x 480.
156 | The shape of your outputs might look like this:
157 |
158 | batch_output.shape
159 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes
160 |
161 | batch_target.shape
162 | >> (32, 520, 480)
163 |
164 | We can flatten the entire output and targets to 1D vectors for each pixel:
165 |
166 | flattened_batch_output.shape
167 | >> (7987200) # flatten by taking the max class prediction
168 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
169 |
170 | flattened_batch_target.shape
171 | >> (7987200) # (batch_target.flatten() in torch)
172 |
173 | The output might look something like this:
174 |
175 | flattened_batch_output
176 | >> array([6, 6, 6, 6, 6, ...])
177 |
178 | flattened_batch_target
179 | >> array([6, 6, 6, 6, 6, ...])
180 |
181 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
182 | pixels - so the model is correct.
183 |
184 | These flattened arrays can then be passed into the .add() method of the evaluator
185 |
186 | .. code-block:: python
187 |
188 |             my_evaluator.add(outputs=flattened_batch_output,
189 |                              targets=flattened_batch_target)
190 |
191 |
192 | :return: void - updates self.voc_evaluator with the data, and updates self.targets and self.outputs
193 | """
194 | self.voc_evaluator.update(targets, outputs)
195 |
196 | self.targets = np.append(self.targets, targets)
197 | self.outputs = np.append(self.outputs, outputs)
198 |
199 | if not self.first_batch_processed:
200 | acc_global, acc, iu = self.voc_evaluator.compute()
201 | self.batch_hash = calculate_batch_hash(np.append(
202 | np.append(np.around(targets, 3), np.around(outputs, 3)),
203 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3)))
204 | self.first_batch_processed = True
205 |
206 | def get_results(self):
207 | """
208 | Reruns the evaluation using the accumulated detections, returns VOC results with IOU and
209 | Accuracy metrics
210 |
211 | :return: dict with PASCAL VOC metrics
212 | """
213 | if self.cached_results:
214 | return self.results
215 |
216 | self.voc_evaluator = ConfusionMatrix(21)
217 | self.voc_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64))
218 |
219 | acc_global, acc, iu = self.voc_evaluator.compute()
220 |
221 | self.results = {
222 | "Accuracy": acc_global.item(),
223 | "Mean IOU": iu.mean().item(),
224 | }
225 |
226 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
227 |
228 | return self.results
229 |
230 | def reset_time(self):
231 | """
232 |         Simple method to reset the timer self.init_time. Often used before the evaluation loop, so the
233 |         evaluation is timed appropriately.
234 |
235 | :return: void - resets self.init_time
236 | """
237 | self.init_time = time.time()
238 |
239 | def save(self):
240 | """
241 | Calculate results and then put into a BenchmarkResult object
242 |
243 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
244 | on the platform.
245 |
246 | :return: BenchmarkResult object with results and metadata
247 | """
248 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
249 | self.get_results()
250 |
251 | # If this is the first time the model is run, then we record evaluation time information
252 |
253 | if not self.cached_results:
254 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
255 | self.speed_mem_metrics['Tasks'] = None
256 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time)
257 | else:
258 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
259 | self.speed_mem_metrics['Tasks'] = None
260 | self.speed_mem_metrics['Evaluation Time'] = None
261 |
262 | return BenchmarkResult(
263 | task=self.task,
264 | config={},
265 | dataset='PASCAL VOC 2012 val',
266 | results=self.results,
267 | speed_mem_metrics=self.speed_mem_metrics,
268 | model=self.model_name,
269 | model_description=self.model_description,
270 | arxiv_id=self.paper_arxiv_id,
271 | pwc_id=self.paper_pwc_id,
272 | paper_results=self.paper_results,
273 | run_hash=self.batch_hash,
274 | )
275 |
--------------------------------------------------------------------------------
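Filling out the control flow the class docstring sketches: a hedged end-to-end loop where `model` and `loader` are hypothetical stand-ins for any PASCAL VOC 2012 val setup that yields (N, 21, H, W) scores and (N, H, W) targets:

    import torch
    from sotabencheval.semantic_segmentation import PASCALVOCEvaluator

    evaluator = PASCALVOCEvaluator(model_name='FCN ResNet-101',
                                   paper_arxiv_id='1605.06211')

    evaluator.reset_time()
    with torch.no_grad():
        for input, target in loader:        # hypothetical VOC 2012 val dataloader
            output = model(input.cuda())    # assumed shape (N, 21, H, W)
            evaluator.add(outputs=output.argmax(1).flatten().cpu().numpy(),
                          targets=target.flatten().cpu().numpy())
            if evaluator.cache_exists:
                break

    evaluator.save()
    print(evaluator.get_results())          # {'Accuracy': ..., 'Mean IOU': ...}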
/sotabencheval/semantic_segmentation/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ConfusionMatrix(object):
5 | def __init__(self, num_classes):
6 | self.num_classes = num_classes
7 | self.mat = None
8 |
9 | def update(self, a, b):
10 | """
11 | print(a.shape)
12 | print(n.shape)
13 | k = (a >= 0) & (a < n)
14 | inds = n * a[k].to(torch.int64) + b[k]
15 | self.mat += np.bincount(inds, minlength=n ** 2).reshape(n, n)
16 | """
17 | n = self.num_classes
18 |
19 | if self.mat is None:
20 | self.mat = np.zeros((n, n), dtype=np.int64)
21 |
22 | k = (a >= 0) & (a < n)
23 | inds = n * a[k].astype(np.int64) + b[k]
24 | self.mat += np.bincount(inds, minlength=n ** 2).reshape(n, n)
25 |
26 | def reset(self):
27 |         self.mat = None  # re-initialised lazily on the next update()
28 |
29 | def compute(self):
30 | h = self.mat
31 | acc_global = np.diag(h).sum() / h.sum()
32 | acc = np.diag(h) / h.sum(1)
33 | iu = np.diag(h) / (h.sum(1) + h.sum(0) - np.diag(h))
34 | return acc_global, acc, iu
35 |
36 | def __str__(self):
37 | acc_global, acc, iu = self.compute()
38 | return (
39 | "global correct: {:.1f}\n"
40 | "average row correct: {}\n"
41 | "IoU: {}\n"
42 | "mean IoU: {:.1f}"
43 | ).format(
44 | acc_global.item() * 100,
45 | ["{:.1f}".format(i) for i in (acc * 100).tolist()],
46 | ["{:.1f}".format(i) for i in (iu * 100).tolist()],
47 | iu.mean().item() * 100,
48 | )
49 |
50 |
--------------------------------------------------------------------------------
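A worked example of the metrics that compute() returns, small enough to verify by hand:

    import numpy as np
    from sotabencheval.semantic_segmentation.utils import ConfusionMatrix

    cm = ConfusionMatrix(num_classes=2)
    targets = np.array([0, 0, 1, 1])
    outputs = np.array([0, 1, 1, 1])
    cm.update(targets, outputs)

    acc_global, acc, iu = cm.compute()
    print(acc_global)    # 0.75               -- 3 of 4 pixels correct
    print(acc)           # [0.5 1. ]          -- per-class (row-wise) accuracy
    print(iu)            # [0.5 0.66666667]   -- per-class IoU
    print(iu.mean())     # 0.5833...          -- mean IoU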
/sotabencheval/utils.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import gzip
3 | import errno
4 | import tarfile
5 | import zipfile
6 | import os
7 | from tqdm import tqdm
8 | from pathlib import Path
9 |
10 |
11 | SOTABENCH_CACHE = Path.home() / ".cache"
12 |
13 |
14 | class AverageMeter(object):
15 | """Computes and stores the average and current value."""
16 |
17 | def __init__(self):
18 | self.val = 0
19 | self.avg = 0
20 | self.sum = 0
21 | self.count = 0
22 |
23 | def reset(self):
24 | self.val = 0
25 | self.avg = 0
26 | self.sum = 0
27 | self.count = 0
28 |
29 | def update(self, val, n=1):
30 | self.val = val
31 | self.sum += val * n
32 | self.count += n
33 | self.avg = self.sum / self.count
34 |
35 |
36 | def calculate_batch_hash(output):
37 | """Calculate the hash for the output of a batch
38 |
39 | Output is passed into this method, stringified, and a hash is taken of the contents. For example,
40 | it could be an list of predictions that is passed in.
41 |
42 | Args:
43 | output: data to be hashed
44 | """
45 | m = hashlib.sha256()
46 | m.update(str(output).encode("utf-8"))
47 | return m.hexdigest()
48 |
49 |
50 | def change_root_if_server(root: str, server_root: str):
51 | """
52 | This method checks whether code is being executed on the sotabench server - if so it returns
53 | server_root, else root. Written as a method so the user doesn't have to fiddle with environmental
54 | variables.
55 |
56 | :param root: (str) a user-specified root
57 | :param server_root: (str) a server root
58 | :return: server_root if SOTABENCH_SERVER env variable is set, else root
59 | """
60 | check_server = os.environ.get("SOTABENCH_SERVER")
61 |
62 | if check_server == 'true':
63 | return server_root
64 |
65 | return root
66 |
67 |
68 | def is_server():
69 | """
70 | Checks whether code is being executed on server; if so, returns True else False.
71 |
72 | Uses env variable SOTABENCH_SERVER to determine whether code is being run on the server.
73 |
74 | You can use this function for your control flow for server specific settings - e.g. the data paths.
75 |
76 | Examples:
77 |
78 | .. code-block:: python
79 |
80 |
81 | from sotabencheval.utils import is_server
82 |
83 | if is_server():
84 | DATA_ROOT = './.data/vision/imagenet'
85 | else: # local settings
86 | DATA_ROOT = '/home/ubuntu/my_data/'
87 |
88 | :return: bool - whether the code is being run on the server or not
89 | """
90 | if os.environ.get("SOTABENCH_SERVER") == 'true':
91 | return True
92 | else:
93 | return False
94 |
95 |
96 | def set_env_on_server(env_name: str, value):
97 | """
98 | If run on sotabench server, sets an environment variable with a given name to value (casted to str).
99 |
100 | :param env_name: (str) environment variable name
101 | :param value: value to set if executed on sotabench
102 | :return: bool - whether code is being run on the server
103 | """
104 | if is_server():
105 | os.environ[env_name] = str(value)
106 | return True
107 | return False
108 |
109 |
110 | def get_max_memory_allocated(device: str = 'cuda'):
111 | """
112 | Finds out the maximum memory allocated, then clears the max memory allocated.
113 |
114 | This currently only works for PyTorch models.
115 |
116 | TODO: Support TensorFlow and MXNet.
117 |
118 | :param device: (str) - name of device (Torch style) -> e.g. 'cuda'
119 | :return: float or None - if torch is in the environment, max memory allocated, else None
120 | """
121 | try:
122 | import torch
123 | max_mem = torch.cuda.max_memory_allocated(device=device)
124 | torch.cuda.reset_max_memory_allocated(device=device)
125 | return max_mem
126 | except ImportError:
127 | return None
128 |
129 | # The utilities below are taken directly from the torchvision repository
130 | # Contains helper functions for unzipping and making directories
131 | # https://github.com/pytorch/vision/tree/master/torchvision
132 |
133 |
134 | def makedir_exist_ok(dirpath):
135 | """
136 | Python2 support for os.makedirs(.., exist_ok=True)
137 | """
138 | try:
139 | os.makedirs(dirpath)
140 | except OSError as e:
141 | if e.errno == errno.EEXIST:
142 | pass
143 | else:
144 | raise
145 |
146 | def gen_bar_updater():
147 | pbar = tqdm(total=None)
148 |
149 | def bar_update(count, block_size, total_size):
150 | if pbar.total is None and total_size:
151 | pbar.total = total_size
152 | progress_bytes = count * block_size
153 | pbar.update(progress_bytes - pbar.n)
154 |
155 | return bar_update
156 |
157 |
158 | def calculate_md5(fpath, chunk_size=1024 * 1024):
159 | md5 = hashlib.md5()
160 | with open(fpath, 'rb') as f:
161 | for chunk in iter(lambda: f.read(chunk_size), b''):
162 | md5.update(chunk)
163 | return md5.hexdigest()
164 |
165 |
166 | def check_md5(fpath, md5, **kwargs):
167 | return md5 == calculate_md5(fpath, **kwargs)
168 |
169 |
170 | def check_integrity(fpath, md5=None):
171 | if not os.path.isfile(fpath):
172 | return False
173 | if md5 is None:
174 | return True
175 | return check_md5(fpath, md5)
176 |
177 |
178 | def download_url(url, root, filename=None, md5=None):
179 | """Download a file from a url and place it in root - utility function taken from torchvision repository
180 | Args:
181 | url (str): URL to download file from
182 | root (str): Directory to place downloaded file in
183 | filename (str, optional): Name to save the file under. If None, use the basename of the URL
184 | md5 (str, optional): MD5 checksum of the download. If None, do not check
185 | """
186 | from six.moves import urllib
187 |
188 | root = os.path.expanduser(root)
189 | if not filename:
190 | filename = os.path.basename(url)
191 | fpath = os.path.join(root, filename)
192 |
193 | makedir_exist_ok(root)
194 |
195 | # downloads file
196 | if check_integrity(fpath, md5):
197 | print('Using downloaded and verified file: ' + fpath)
198 | else:
199 | try:
200 | print('Downloading ' + url + ' to ' + fpath)
201 | urllib.request.urlretrieve(
202 | url, fpath,
203 | reporthook=gen_bar_updater()
204 | )
205 | except (urllib.error.URLError, IOError) as e:
206 | if url[:5] == 'https':
207 | url = url.replace('https:', 'http:')
208 | print('Failed download. Trying https -> http instead.'
209 | ' Downloading ' + url + ' to ' + fpath)
210 | urllib.request.urlretrieve(
211 | url, fpath,
212 | reporthook=gen_bar_updater()
213 | )
214 | else:
215 | raise e
216 |
217 |
218 | def _is_tar(filename):
219 | return filename.endswith(".tar")
220 |
221 |
222 | def _is_targz(filename):
223 | return filename.endswith(".tar.gz")
224 |
225 |
226 | def _is_gzip(filename):
227 | return filename.endswith(".gz") and not filename.endswith(".tar.gz")
228 |
229 |
230 | def _is_zip(filename):
231 | return filename.endswith(".zip")
232 |
233 |
234 | def extract_archive(from_path, to_path=None, remove_finished=False):
235 | if to_path is None:
236 | to_path = os.path.dirname(from_path)
237 |
238 | if _is_tar(from_path):
239 | with tarfile.open(from_path, 'r') as tar:
240 | tar.extractall(path=to_path)
241 | elif _is_targz(from_path):
242 | with tarfile.open(from_path, 'r:gz') as tar:
243 | tar.extractall(path=to_path)
244 | elif _is_gzip(from_path):
245 | to_path = os.path.join(to_path, os.path.splitext(os.path.basename(from_path))[0])
246 | with open(to_path, "wb") as out_f, gzip.GzipFile(from_path) as zip_f:
247 | out_f.write(zip_f.read())
248 | elif _is_zip(from_path):
249 | with zipfile.ZipFile(from_path, 'r') as z:
250 | z.extractall(to_path)
251 | else:
252 | raise ValueError("Extraction of {} not supported".format(from_path))
253 |
254 | if remove_finished:
255 | os.remove(from_path)
256 |
--------------------------------------------------------------------------------
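A brief sketch of the path and download helpers used together; the URL below is a placeholder, not a real dataset location:

    import os
    from sotabencheval.utils import change_root_if_server, download_url, extract_archive

    data_root = change_root_if_server(root='/home/ubuntu/my_data',
                                      server_root='./.data/nlp/squad')

    # placeholder URL -- substitute the real archive for your benchmark
    download_url('https://example.com/dev-v2.0.json.gz', root=data_root,
                 filename='dev-v2.0.json.gz')
    extract_archive(os.path.join(data_root, 'dev-v2.0.json.gz'), to_path=data_root)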
/sotabencheval/version.py:
--------------------------------------------------------------------------------
1 | class Version:
2 | __slots__ = ("major", "minor", "build")
3 |
4 | def __init__(self, major, minor, build):
5 | self.major = major
6 | self.minor = minor
7 | self.build = build
8 |
9 | def __str__(self):
10 | return f"{self.major}.{self.minor}.{self.build}"
11 |
12 | def __repr__(self):
13 | return (
14 | f"Version(major={self.major}, minor={self.minor}, "
15 | f"build={self.build})"
16 | )
17 |
18 | version = Version(0, 0, 38)
19 |
20 | __version__ = str(version)
21 |
--------------------------------------------------------------------------------